There are cases where opportunities to use POST_INC addressing
only occur very late in the compilation process.  Take for example
the following function from AVR-LibC's qsort:

void swapfunc (char *a, char *b, int n)
{
    do
    {
        char t = *a;
        *a++ = *b;
        *b++ = t;
    } while (--n > 0);
}


which  -mmcu=avrtiny -S -Os -dp  compiles to:

swapfunc:
        push r28                 ;  72  [c=4 l=1]  pushqi1/0
        push r29                 ;  73  [c=4 l=1]  pushqi1/0
/* prologue: function */
/* frame size = 0 */
/* stack size = 2 */
        mov r26,r24      ;  66  [c=4 l=1]  movqi_insn/0
        mov r27,r25      ;  67  [c=4 l=1]  movqi_insn/0
        mov r30,r22      ;  68  [c=4 l=1]  movqi_insn/0
        mov r31,r23      ;  69  [c=4 l=1]  movqi_insn/0
        mov r22,r20      ;  70  [c=4 l=1]  movqi_insn/0
        mov r23,r21      ;  71  [c=4 l=1]  movqi_insn/0
.L2:
        ld r20,X                 ;  55  [c=4 l=1]  movqi_insn/3
        ld r21,Z                 ;  57  [c=4 l=1]  movqi_insn/3
        st X,r21                 ;  58  [c=4 l=1]  movqi_insn/2
        subi r26,-1      ;  59  [c=4 l=2]  *addhi3_clobber/0
        sbci r27,-1
        st Z,r20                 ;  61  [c=4 l=1]  movqi_insn/2
        subi r30,-1      ;  62  [c=4 l=2]  *addhi3_clobber/0
        sbci r31,-1
        subi r22, 1      ;  81  [c=4 l=2]  *add.for.cczn.hi/0
        sbci r23, 0
        breq .+2         ;  82  [c=8 l=2]  branch_ZN
        brpl .L2
/* epilogue start */
        pop r29          ;  76  [c=4 l=1]  popqi
        pop r28          ;  77  [c=4 l=1]  popqi
        ret              ;  78  [c=0 l=1]  return_from_epilogue

Insns 58+59 and 61+62 are post-inc stores.  They are not recognized
because the code prior to cprop_hardreg is a bit of a mess that moves
addresses back and forth (including through Y).  Only after cprop_hardreg
is the code simple enough for avr-fuse-add to detect the post-increments.
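
Once those store + increment pairs are fused, the loop could look
roughly like this (a hand-written sketch of the expected result, not
actual compiler output):

.L2:
        ld r20,X
        ld r21,Z
        st X+,r21        ; fused from insns 58 + 59
        st Z+,r20        ; fused from insns 61 + 62
        subi r22, 1
        sbci r23, 0
        breq .+2
        brpl .L2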

Hence this patch runs a 2nd instance of that pass late, after
cprop_hardreg (the 1st instance runs prior to peephole2).

It renames avr_split_tiny_move to avr_split_fake_addressing_move
because that function also splits some insns on non-avrtiny cores.

The patch removes a define_split that's no longer needed because
such splits are performed by avr_split_fake_addressing_move.
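
For illustration, the removed "split-lpmx" turned an LPM load  R = *Z
into  R = *Z++  followed by  Z -= sizeof (R)  so that the trailing
subtraction can be optimized out.  For an HImode load that's roughly
(a sketch, assuming AVR_HAVE_LPMX):

        lpm r24, Z+      ; low byte, Z++
        lpm r25, Z+      ; high byte, Z++
        sbiw r30, 2      ; Z -= 2, ideally fused or removed later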

Passed without new regressions.  Ok for trunk?

Johann

p.s. Post-inc etc. optimization is basically non-existent in GCC.
One reason seems to be SSA, because each SSA variable gets its own
register, which results in a jungle of required registers that
even pass auto-inc-dec cannot penetrate...

Take for example the following code, which would require only 12
instructions, as indicated by the comments:

void add4 (uint8_t *aa, const __flash uint8_t *bb, uint8_t nn)
{
    // Set Z (R30) to bb (1 MOVW or 2 MOVs)
    // Set X (R26) to aa (1 MOVW or 2 MOVs)
    do
    {
        uint8_t sum = 0;
        sum += *bb++; // 1 instruction: POST_INC load
        sum += *bb++; // 2 instructions: POST_INC load + add
        sum += *bb++; // 2 instructions: POST_INC load + add
        sum += *bb++; // 2 instructions: POST_INC load + add
        *aa++ = sum;  // 1 instruction: POST_INC store
    } while (--nn);   // 2 instructions: dec + branch
}

But  -mmcu=avr4 -Os -S -dp  compiles this to madness that requires
more than a dozen registers, computes each intermediate value in its
own 16-bit register, and comes out with code that takes 34
instructions instead of 12.
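
For reference, the 12-instruction version outlined by the comments
could look like this (a hand-written sketch, register choice
illustrative, assuming the usual avr-gcc convention of aa in R25:R24,
bb in R23:R22 and nn in R20; 12 instructions plus the final RET):

add4:
        movw r30,r22     ; Z = bb
        movw r26,r24     ; X = aa
.Loop:
        lpm r24,Z+       ; sum = *bb++
        lpm r25,Z+       ; sum += *bb++ ...
        add r24,r25      ; ... load plus add
        lpm r25,Z+
        add r24,r25
        lpm r25,Z+
        add r24,r25
        st X+,r24        ; *aa++ = sum
        dec r20          ; --nn ...
        brne .Loop       ; ... plus branch
        ret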

--

AVR: Run pass avr-fuse-add a second time after pass_cprop_hardreg.

gcc/
        * config/avr/avr-protos.h (avr_split_tiny_move): Rename to
        avr_split_fake_addressing_move.
        * config/avr/avr-passes.cc: Same.
        (avr_pass_data_fuse_add) <tv_id>: Set to TV_MACH_DEP.
        (avr_pass_fuse_add) <clone>: Override.
        * config/avr/avr-passes.def (avr_pass_fuse_add): Run again
        after pass_cprop_hardreg.
        * config/avr/avr.md (split-lpmx): Remove a define_split.  Such
        splits are performed by avr_split_fake_addressing_move.
diff --git a/gcc/config/avr/avr-passes.cc b/gcc/config/avr/avr-passes.cc
index 8b018ff6a05..8a71b57ada1 100644
--- a/gcc/config/avr/avr-passes.cc
+++ b/gcc/config/avr/avr-passes.cc
@@ -1009,7 +1009,7 @@ static const pass_data avr_pass_data_fuse_add =
   RTL_PASS,	    // type
   "",		    // name (will be patched)
   OPTGROUP_NONE,    // optinfo_flags
-  TV_DF_SCAN,	    // tv_id
+  TV_MACH_DEP,	    // tv_id
   0,		    // properties_required
   0,		    // properties_provided
   0,		    // properties_destroyed
@@ -1026,6 +1026,13 @@ public:
     this->name = name;
   }
 
+  // Cloning is required because we are running one instance of the pass
+  // before peephole2, and a second one after cprop_hardreg.
+  opt_pass * clone () final override
+  {
+    return make_avr_pass_fuse_add (m_ctxt);
+  }
+
   bool gate (function *) final override
   {
     return optimize && avr_fuse_add > 0;
@@ -1503,8 +1510,8 @@ avr_pass_fuse_add::fuse_mem_add (Mem_Insn &mem, Add_Insn &add)
    - PLUS insn of that kind.
    - Indirect loads and stores.
    In almost all cases, combine opportunities arise from the preparation
-   done by `avr_split_tiny_move', but in some rare cases combinations are
-   found for the ordinary cores, too.
+   done by `avr_split_fake_addressing_move', but in some rare cases combinations
+   are found for the ordinary cores, too.
       As we consider at most one Mem insn per try, there may still be missed
    optimizations like  POST_INC + PLUS + POST_INC  might be performed
    as  PRE_DEC + PRE_DEC  for two adjacent locations.  */
@@ -1714,7 +1721,7 @@ public:
    core's capabilities.  This sets the stage for pass .avr-fuse-add.  */
 
 bool
-avr_split_tiny_move (rtx_insn * /*insn*/, rtx *xop)
+avr_split_fake_addressing_move (rtx_insn * /*insn*/, rtx *xop)
 {
   bool store_p = false;
   rtx mem, reg_or_0;
diff --git a/gcc/config/avr/avr-passes.def b/gcc/config/avr/avr-passes.def
index 748260edaef..cd89d673727 100644
--- a/gcc/config/avr/avr-passes.def
+++ b/gcc/config/avr/avr-passes.def
@@ -20,12 +20,33 @@
 /* A post reload optimization pass that fuses PLUS insns with CONST_INT
    addend with a load or store insn to get POST_INC or PRE_DEC addressing.
    It can also fuse two PLUSes to a single one, which may occur due to
-   splits from `avr_split_tiny_move'.  We do this in an own pass because
-   it can find more cases than peephole2, for example when there are
-   unrelated insns between the interesting ones.  */
+   splits from `avr_split_fake_addressing_move'.  We do this in an own
+   pass because it can find more cases than peephole2, for example when
+   there are unrelated insns between the interesting ones.  */
 
 INSERT_PASS_BEFORE (pass_peephole2, 1, avr_pass_fuse_add);
 
+/* There are cases where avr-fuse-add doesn't find POST_INC cases because
+   the RTL code at that time is too long-winded, and moves registers back and
+   forth (which seems to be the same reason for why pass auto_inc_dec cannot
+   find POST_INC, either).  Some of that long-windedness is cleaned up very
+   late in pass cprop_hardreg, which opens up new opportunities to find post
+   increments.  An example is the following function from AVR-LibC's qsort:
+
+   void swapfunc (char *a, char *b, int n)
+   {
+       do
+       {
+           char tmp = *a;
+           *a++ = *b;
+           *b++ = tmp;
+       } while (--n > 0);
+   }
+
+   Hence, run avr-fuse-add twice; the second time after cprop_hardreg.  */
+
+INSERT_PASS_AFTER (pass_cprop_hardreg, 1, avr_pass_fuse_add);
+
 /* An analysis pass that runs prior to prologue / epilogue generation.
    Computes cfun->machine->gasisr.maybe which is used in prologue and
    epilogue generation provided -mgas-isr-prologues is on.  */
@@ -47,9 +68,9 @@ INSERT_PASS_BEFORE (pass_free_cfg, 1, avr_pass_recompute_notes);
    tries to fix such situations by operating on the original mode.  This
    reduces code size and register pressure.
 
-   The assertion is that the code generated by casesi is unaltered and a
+   The assertion is that the code generated by casesi is unaltered and
    a sign-extend or zero-extend from QImode or HImode precedes the casesi
-   insns withaout any insns in between.  */
+   insns without any insns in between.  */
 
 INSERT_PASS_AFTER (pass_expand, 1, avr_pass_casesi);
 
diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h
index 289c80cab55..d6f4abbac66 100644
--- a/gcc/config/avr/avr-protos.h
+++ b/gcc/config/avr/avr-protos.h
@@ -179,7 +179,7 @@ extern rtl_opt_pass *make_avr_pass_casesi (gcc::context *);
 extern rtl_opt_pass *make_avr_pass_ifelse (gcc::context *);
 #ifdef RTX_CODE
 extern bool avr_casei_sequence_check_operands (rtx *xop);
-extern bool avr_split_tiny_move (rtx_insn *insn, rtx *operands);
+extern bool avr_split_fake_addressing_move (rtx_insn *insn, rtx *operands);
 #endif /* RTX_CODE */
 
 /* From avr-log.cc */
diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md
index f477ac170ea..520f1fe41a2 100644
--- a/gcc/config/avr/avr.md
+++ b/gcc/config/avr/avr.md
@@ -993,41 +993,10 @@ (define_peephole2
               (clobber (reg:CC REG_CC))])])
 
 
-;; For LPM loads from AS1 we split
-;;    R = *Z
-;; to
-;;    R = *Z++
-;;    Z = Z - sizeof (R)
-;;
-;; so that the second instruction can be optimized out.
-
-(define_split ; "split-lpmx"
-  [(set (match_operand:HISI 0 "register_operand" "")
-        (match_operand:HISI 1 "memory_operand" ""))]
-  "reload_completed
-   && AVR_HAVE_LPMX
-   && avr_mem_flash_p (operands[1])
-   && REG_P (XEXP (operands[1], 0))
-   && !reg_overlap_mentioned_p (XEXP (operands[1], 0), operands[0])"
-  [(set (match_dup 0)
-        (match_dup 2))
-   (set (match_dup 3)
-        (plus:HI (match_dup 3)
-                 (match_dup 4)))]
-  {
-     rtx addr = XEXP (operands[1], 0);
-
-    operands[2] = replace_equiv_address (operands[1],
-                                         gen_rtx_POST_INC (Pmode, addr));
-    operands[3] = addr;
-    operands[4] = gen_int_mode (-<SIZE>, HImode);
-  })
-
-
 ;; Legitimate address and stuff allows way more addressing modes than
 ;; Reduced Tiny actually supports.  Split them now so that we get
 ;; closer to real instructions which may result in some optimization
-;; opportunities.
+;; opportunities.  This applies also to fake X + offset addressing.
 (define_split
   [(parallel [(set (match_operand:MOVMODE 0 "nonimmediate_operand")
                    (match_operand:MOVMODE 1 "general_operand"))
@@ -1040,7 +1009,7 @@ (define_split
    && (MEM_P (operands[0]) || MEM_P (operands[1]))"
   [(scratch)]
   {
-    if (avr_split_tiny_move (curr_insn, operands))
+    if (avr_split_fake_addressing_move (curr_insn, operands))
       DONE;
     FAIL;
   })
