Mesa (master): broadcom/compiler: try to fill up delay slots after unconditional branch

GitLab Mirror Tue, 30 Mar 2021 23:09:32 -0700

Module: Mesa
Branch: master
Commit: 8f7640293d8a344d0be91cf91e58cc8899b2b08a
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=8f7640293d8a344d0be91cf91e58cc8899b2b08a


Author: Iago Toral Quiroga <[email protected]>
Date:   Tue Mar 30 10:58:04 2021 +0200

broadcom/compiler: try to fill up delay slots after unconditional branch

If we have an unconditional branch then we can try to fill up its
delay slots with the initial instructions of its successor block by
copying them into the delay slots and adjusting the branch offset to
skip the copied instructions.

total nops in shared programs: 365640 -> 364471 (-0.32%)
nops in affected programs: 15416 -> 14247 (-7.58%)
helped: 462
HURT: 0

Reviewed-by: Alejandro Piñeiro <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9918>

---

 src/broadcom/compiler/qpu_schedule.c | 52 +++++++++++++++++++++++++++++++++---
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index 3dd5d246a45..11952ba04ce 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -1846,8 +1846,9 @@ emit_branch(struct v3d_compile *c,
 
         /* Fill any remaining delay slots.
          *
-         * FIXME: For unconditional branches we could fill these with the
-         * first instructions in the successor block.
+         * For unconditional branches we'll try to fill these with the
+         * first instructions in the successor block after scheduling
+         * all blocks when setting up branch targets.
          */
         for (int i = 0; i < 3 - slots_filled; i++)
                 emit_nop(c, block, scoreboard);
@@ -2209,13 +2210,28 @@ qpu_set_branch_targets(struct v3d_compile *c)
                  */
                 struct qinst *branch = NULL;
                 struct list_head *entry = block->instructions.prev;
+                int32_t delay_slot_count = -1;
+                struct qinst *delay_slots_start = NULL;
                 for (int i = 0; i < 3; i++) {
                         entry = entry->prev;
-                        branch = container_of(entry, struct qinst, link);
-                        if (branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
+                        struct qinst *inst =
+                                container_of(entry, struct qinst, link);
+
+                        if (delay_slot_count == -1) {
+                                if (!v3d_qpu_is_nop(&inst->qpu))
+                                        delay_slot_count = i;
+                                else
+                                        delay_slots_start = inst;
+                        }
+
+                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH) {
+                                branch = inst;
                                 break;
+                        }
                 }
+                assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
+                assert(delay_slot_count >= 0 && delay_slot_count <= 3);
+                assert(delay_slot_count == 0 || delay_slots_start != NULL);
 
                 /* Make sure that the if-we-don't-jump
                  * successor was scheduled just after the
@@ -2241,6 +2257,34 @@ qpu_set_branch_targets(struct v3d_compile *c)
                 c->uniform_data[branch->uniform] =
                         (block->successors[0]->start_uniform -
                          (block->branch_uniform + 1)) * 4;
+
+                /* If this is an unconditional branch, try to fill any remaining
+                 * delay slots with the initial instructions of the successor
+                 * block.
+                 *
+                 * FIXME: we can do the same for conditional branches if we
+                 * predicate the instructions to match the branch condition.
+                 */
+                if (branch->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS) {
+                        struct list_head *successor_insts =
+                                &block->successors[0]->instructions;
+                        delay_slot_count = MIN2(delay_slot_count,
+                                                list_length(successor_insts));
+                        struct qinst *s_inst =
+                                (struct qinst *) successor_insts->next;
+                        struct qinst *slot = delay_slots_start;
+                        int slots_filled = 0;
+                        while (slots_filled < delay_slot_count &&
+                               qpu_inst_valid_in_branch_delay_slot(c, s_inst)) {
+                                memcpy(&slot->qpu, &s_inst->qpu,
+                                       sizeof(slot->qpu));
+                                s_inst = (struct qinst *) s_inst->link.next;
+                                slot = (struct qinst *) slot->link.next;
+                                slots_filled++;
+                        }
+                        branch->qpu.branch.offset +=
+                                slots_filled * sizeof(uint64_t);
+                }
         }
 }
 

_______________________________________________
mesa-commit mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-commit

Mesa (master): broadcom/compiler: try to fill up delay slots after unconditional branch

Reply via email to