I just committed the attached patch to the openacc-gcc-9-branch.
The patch removes the redundant s_waitcnt instruction that was emitted after store instructions. Using s_waitcnt with expcnt there was based on a misunderstanding of the documentation.
Andrew
Backport expcnt patches. 2019-09-05 Andrew Stubbs <a...@codesourcery.com> Backport from mainline: 2019-07-31 Andrew Stubbs <a...@codesourcery.com> gcc/ * config/gcn/gcn-valu.md (scatter<mode>_insn_1offset<exec_scatter>): Remove s_waitcnt. (scatter<mode>_insn_1offset_ds<exec_scatter>): Likewise. (scatter<mode>_insn_2offsets<exec_scatter>): Likewise. * config/gcn/gcn.c (gcn_md_reorg): Add delayeduse and reads to struct ilist. Add nops for delayeduse insns. * config/gcn/gcn.md (delayeduse): New attribute. (*movbi): Remove s_waitcnt from stores. (*mov<mode>_insn): Likewise. (*movti_insn): Likewise. Add delayeduse attribute. (sync_compare_and_swap<mode>_insn): Add delayeduse attribute. (atomic_store<mode>): Remove or adjust s_waitcnt. 2019-09-05 Andrew Stubbs <a...@codesourcery.com> gcc/ * config/gcn/gcn.md (*movti_insn): Set delayeduse for global_store. (sync_compare_and_swap<mode>_insn): Likewise. diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index 3cc59dd1cd6..3a2287d0ebf 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -863,15 +863,12 @@ if (AS_FLAT_P (as)) { if (TARGET_GCN5_PLUS) - sprintf (buf, "flat_store%%s2\t%%0, %%2 offset:%%1%s\;" - "s_waitcnt\texpcnt(0)", glc); + sprintf (buf, "flat_store%%s2\t%%0, %%2 offset:%%1%s", glc); else - sprintf (buf, "flat_store%%s2\t%%0, %%2%s\;s_waitcnt\texpcnt(0)", - glc); + sprintf (buf, "flat_store%%s2\t%%0, %%2%s", glc); } else if (AS_GLOBAL_P (as)) - sprintf (buf, "global_store%%s2\t%%0, %%2, off offset:%%1%s\;" - "s_waitcnt\texpcnt(0)", glc); + sprintf (buf, "global_store%%s2\t%%0, %%2, off offset:%%1%s", glc); else gcc_unreachable (); @@ -895,7 +892,7 @@ { addr_space_t as = INTVAL (operands[3]); static char buf[200]; - sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s\;s_waitcnt\texpcnt(0)", + sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s", (AS_GDS_P (as) ? 
" gds" : "")); return buf; } @@ -929,8 +926,8 @@ /* Work around assembler bug in which a 64-bit register is expected, but a 32-bit value would be correct. */ int reg = REGNO (operands[1]) - FIRST_VGPR_REG; - sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s\;" - "s_waitcnt\texpcnt(0)", reg, reg + 1, glc); + sprintf (buf, "global_store%%s3\tv[%d:%d], %%3, %%0 offset:%%2%s", + reg, reg + 1, glc); } else gcc_unreachable (); diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index b7cf6f093fa..66854b6f9c5 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -4509,7 +4509,9 @@ gcn_md_reorg (void) { rtx_insn *insn; attr_unit unit; + attr_delayeduse delayeduse; HARD_REG_SET writes; + HARD_REG_SET reads; int age; } back[max_waits]; int oldest = 0; @@ -4528,6 +4530,7 @@ gcn_md_reorg (void) attr_type itype = get_attr_type (insn); attr_unit iunit = get_attr_unit (insn); + attr_delayeduse idelayeduse = get_attr_delayeduse (insn); HARD_REG_SET ireads, iwrites; CLEAR_HARD_REG_SET (ireads); CLEAR_HARD_REG_SET (iwrites); @@ -4603,6 +4606,14 @@ gcn_md_reorg (void) (regs, reg_class_contents[(int) VGPR_REGS])) nops_rqd = 2 - prev_insn->age; } + + /* Store that requires input registers are not overwritten by + following instruction. */ + if ((prev_insn->age + nops_rqd) < 1 + && prev_insn->delayeduse == DELAYEDUSE_YES + && ((hard_reg_set_intersect_p + (prev_insn->reads, iwrites)))) + nops_rqd = 1 - prev_insn->age; } /* Insert the required number of NOPs. */ @@ -4630,7 +4641,9 @@ gcn_md_reorg (void) /* Track the current instruction as a previous instruction. 
*/ back[oldest].insn = insn; back[oldest].unit = iunit; + back[oldest].delayeduse = idelayeduse; COPY_HARD_REG_SET (back[oldest].writes, iwrites); + COPY_HARD_REG_SET (back[oldest].reads, ireads); back[oldest].age = 0; oldest = (oldest + 1) % max_waits; diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index 2b805a73c56..537bb260ff7 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -285,6 +285,11 @@ (define_attr "laneselect" "yes,no" (const_string "no")) +; Identify instructions that require a "Manually Inserted Wait State" if +; their inputs are overwritten by subsequent instructions. + +(define_attr "delayeduse" "yes,no" (const_string "no")) + ;; }}} ;; {{{ Iterators useful across the wole machine description @@ -475,15 +480,15 @@ case 6: return "s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)"; case 7: - return "s_store_dword\t%1, %A0\;s_waitcnt\texpcnt(0)"; + return "s_store_dword\t%1, %A0"; case 8: return "flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0"; case 9: - return "flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"; + return "flat_store_dword\t%A0, %1%O0%g0"; case 10: return "global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)"; case 11: - return "global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)"; + return "global_store_dword\t%A0, %1%O0%g0"; default: gcc_unreachable (); } @@ -506,20 +511,20 @@ s_movk_i32\t%0, %1 s_mov_b32\t%0, %1 s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0) - s_buffer_store%s1\t%1, s[0:3], %0\;s_waitcnt\texpcnt(0) + s_buffer_store%s1\t%1, s[0:3], %0 s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0) - s_store_dword\t%1, %A0\;s_waitcnt\texpcnt(0) + s_store_dword\t%1, %A0 v_mov_b32\t%0, %1 v_readlane_b32\t%0, %1, 0 v_writelane_b32\t%0, %1, 0 flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0 - flat_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0) + flat_store_dword\t%A0, %1%O0%g0 v_mov_b32\t%0, %1 - ds_write_b32\t%A0, %1%O0\;s_waitcnt\texpcnt(0) + ds_write_b32\t%A0, %1%O0 ds_read_b32\t%0, 
%A1%O1\;s_waitcnt\tlgkmcnt(0) s_mov_b32\t%0, %1 global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) - global_store_dword\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)" + global_store_dword\t%A0, %1%O0%g0" [(set_attr "type" "sop1,sopk,sop1,smem,smem,smem,smem,vop1,vop3a,vop3a,flat, flat,vop1,ds,ds,sop1,flat,flat") (set_attr "exec" "*,*,*,*,*,*,*,*,none,none,*,*,*,*,*,*,*,*") @@ -541,12 +546,12 @@ v_readlane_b32\t%0, %1, 0 v_writelane_b32\t%0, %1, 0 flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0 - flat_store%s0\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0) + flat_store%s0\t%A0, %1%O0%g0 v_mov_b32\t%0, %1 - ds_write%b0\t%A0, %1%O0\;s_waitcnt\texpcnt(0) + ds_write%b0\t%A0, %1%O0 ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) - global_store%s0\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)" + global_store%s0\t%A0, %1%O0%g0" [(set_attr "type" "sop1,sopk,sop1,vop1,vop3a,vop3a,flat,flat,vop1,ds,ds,flat,flat") (set_attr "exec" "*,*,*,*,none,none,*,*,*,*,*,*,*") @@ -564,18 +569,18 @@ s_mov_b64\t%0, %1 s_mov_b64\t%0, %1 # - s_store_dwordx2\t%1, %A0\;s_waitcnt\texpcnt(0) + s_store_dwordx2\t%1, %A0 s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0) # # # # flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0 - flat_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0) - ds_write_b64\t%A0, %1%O0\;s_waitcnt\texpcnt(0) + flat_store_dwordx2\t%A0, %1%O0%g0 + ds_write_b64\t%A0, %1%O0 ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) - global_store_dwordx2\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0)" + global_store_dwordx2\t%A0, %1%O0%g0" "(reload_completed && !MEM_P (operands[0]) && !MEM_P (operands[1]) && !gcn_sgpr_move_p (operands[0], operands[1])) || (GET_CODE (operands[1]) == CONST_INT && !gcn_constant64_p (operands[1]))" @@ -617,16 +622,16 @@ "" "@ # - s_store_dwordx4\t%1, %A0\;s_waitcnt\texpcnt(0) + s_store_dwordx4\t%1, %A0 s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0) - flat_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0) 
+ flat_store_dwordx4\t%A0, %1%O0%g0 flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0 # # # - global_store_dwordx4\t%A0, %1%O0%g0\;s_waitcnt\texpcnt(0) + global_store_dwordx4\t%A0, %1%O0%g0 global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) - ds_write_b128\t%A0, %1%O0\;s_waitcnt\texpcnt(0) + ds_write_b128\t%A0, %1%O0 ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)" "reload_completed && REG_P (operands[0]) @@ -647,6 +652,7 @@ } [(set_attr "type" "mult,smem,smem,flat,flat,vmult,vmult,vmult,flat,flat,\ ds,ds") + (set_attr "delayeduse" "*,*,yes,*,*,*,*,*,yes,*,*,*") (set_attr "length" "*,12,12,12,12,*,*,*,12,12,12,12")]) ;; }}} @@ -1594,7 +1600,8 @@ global_atomic_cmpswap<X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)" [(set_attr "type" "smem,flat,flat") (set_attr "length" "12") - (set_attr "gcn_version" "gcn5,*,gcn5")]) + (set_attr "gcn_version" "gcn5,*,gcn5") + (set_attr "delayeduse" "*,yes,yes")]) (define_insn "sync_compare_and_swap<mode>_lds_insn" [(set (match_operand:SIDI 0 "register_operand" "= v") @@ -1697,14 +1704,11 @@ switch (which_alternative) { case 0: - return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;" - "s_waitcnt\texpcnt(0)"; + return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc"; case 1: - return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;" - "s_waitcnt\texpcnt(0)"; + return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc"; case 2: - return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;" - "s_waitcnt\texpcnt(0)"; + return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc"; } break; case MEMMODEL_ACQ_REL: @@ -1714,13 +1718,13 @@ { case 0: return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;" - "s_waitcnt\texpcnt(0)\;s_dcache_inv_vol"; + "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol"; case 1: return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;" - "s_waitcnt\texpcnt(0)\;buffer_wbinvl1_vol"; + "s_waitcnt\t0\;buffer_wbinvl1_vol"; case 2: return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;" - 
"s_waitcnt\texpcnt(0)\;buffer_wbinvl1_vol"; + "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"; } break; }