Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor
Sorry, fixed a couple of typos that prevented the patch from actually working. Here's the updated version. I'll be building on ADDR_QUERY_STR for identifying and preventing pre/post incrementing addresses for stores for falkor. Siddhesh 2018-xx-xx Jim Wilson Kugan Vivenakandarajah Siddhesh Poyarekar gcc/ * gcc/config/aarch64/aarch64-protos.h (aarch64_addr_query_type): New member ADDR_QUERY_STR. * gcc/config/aarch64/aarch64-tuning-flags.def (SLOW_REGOFFSET_QUADWORD_STORE): New. * gcc/config/aarch64/aarch64.c (qdf24xx_tunings): Add SLOW_REGOFFSET_QUADWORD_STORE to tuning flags. (aarch64_classify_address): Avoid register indexing for quad mode stores when SLOW_REGOFFSET_QUADWORD_STORE is set. * gcc/config/aarch64/constraints.md (Uts): New constraint. * gcc/config/aarch64/aarch64.md (movti_aarch64, movtf_aarch64): Use it. * gcc/config/aarch64/aarch64-simd.md (aarch64_simd_mov): Likewise. gcc/testsuite/ * gcc/testsuite/gcc.target/aarch64/pr82533.c: New test case. diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 159bc6aee7e..15924fc3f58 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -120,6 +120,9 @@ enum aarch64_symbol_type ADDR_QUERY_LDP_STP Query what is valid for a load/store pair. + ADDR_QUERY_STR + Query what is valid for a store. + ADDR_QUERY_ANY Query what is valid for at least one memory constraint, which may allow things that "m" doesn't. 
For example, the SVE LDR and STR @@ -128,6 +131,7 @@ enum aarch64_symbol_type enum aarch64_addr_query_type { ADDR_QUERY_M, ADDR_QUERY_LDP_STP, + ADDR_QUERY_STR, ADDR_QUERY_ANY }; diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 3d1f6a01cb7..48d92702723 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -131,9 +131,9 @@ (define_insn "*aarch64_simd_mov" [(set (match_operand:VQ 0 "nonimmediate_operand" - "=w, Umq, m, w, ?r, ?w, ?r, w") + "=w, Umq, Uts, w, ?r, ?w, ?r, w") (match_operand:VQ 1 "general_operand" - "m, Dz, w, w, w, r, r, Dn"))] + "m, Dz,w, w, w, r, r, Dn"))] "TARGET_SIMD && (register_operand (operands[0], mode) || aarch64_simd_reg_or_zero (operands[1], mode))" diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index ea9ead234cb..04baf5b6de6 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -41,4 +41,8 @@ AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW) are not considered cheap. */ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND) +/* Don't use a register offset in a memory address for a quad-word store. */ +AARCH64_EXTRA_TUNING_OPTION ("slow_regoffset_quadword_store", +SLOW_REGOFFSET_QUADWORD_STORE) + #undef AARCH64_EXTRA_TUNING_OPTION diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 0599a79bfeb..664d4a18354 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -894,7 +894,7 @@ static const struct tune_params qdf24xx_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK,/* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE), /* tune_flags. 
*/ &qdf24xx_prefetch_tune }; @@ -5531,6 +5531,16 @@ aarch64_classify_address (struct aarch64_address_info *info, || vec_flags == VEC_ADVSIMD || vec_flags == VEC_SVE_DATA)); + /* Avoid register indexing for 128-bit stores when the + AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE option is set. */ + if (!optimize_size + && type == ADDR_QUERY_STR + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE) + && (mode == TImode || mode == TFmode + || aarch64_vector_data_mode_p (mode))) +allow_reg_index_p = false; + /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and [Rn, #offset, MUL VL]. */ if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0 diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index edb6a758333..348b867ff7f 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1079,7 +1079,7 @@ (define_insn "*movti_aarch64" [(set (match_operand:TI 0 -"nonimmediate_operand" "=r, w,r,w,r,m,m,w,m") +"nonimmediate_operand" "=r,w,r,w,r,m,m,w,Uts") (match_operand:TI 1 "aarch64_movti_opera
Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor
On Thursday 18 January 2018 01:11 AM, Jim Wilson wrote: > This is the only solution I found that worked. I tried a few things and ended up with pretty much the same fix you have except with the check in a slightly different place. That is, I used aarch64_classify_address to gate the change because I intend to use that to gate pre/post-increments for stores as well for falkor. I also fixed up my submission to incorporate your more recent changes, which was to add an extra tuning option instead of just checking for falkor. Siddhesh 2018-xx-xx Jim Wilson Kugan Vivenakandarajah Siddhesh Poyarekar gcc/ * gcc/config/aarch64/aarch64-protos.h (aarch64_addr_query_type): New member ADDR_QUERY_STR. * gcc/config/aarch64/aarch64-tuning-flags.def (SLOW_REGOFFSET_QUADWORD_STORE): New. * gcc/config/aarch64/aarch64.c (qdf24xx_tunings): Add SLOW_REGOFFSET_QUADWORD_STORE to tuning flags. (aarch64_classify_address): Avoid register indexing for quad mode stores when SLOW_REGOFFSET_QUADWORD_STORE is set. * gcc/config/aarch64/constraints.md (Uts): New constraint. * gcc/config/aarch64/aarch64.md (movti_aarch64, movtf_aarch64): Use it. * gcc/config/aarch64/aarch64-simd.md (aarch64_simd_mov): Likewise. gcc/testsuite/ * gcc/testsuite/gcc.target/aarch64/pr82533.c: New test case. diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 159bc6aee7e..15924fc3f58 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -120,6 +120,9 @@ enum aarch64_symbol_type ADDR_QUERY_LDP_STP Query what is valid for a load/store pair. + ADDR_QUERY_STR + Query what is valid for a store. + ADDR_QUERY_ANY Query what is valid for at least one memory constraint, which may allow things that "m" doesn't. 
For example, the SVE LDR and STR @@ -128,6 +131,7 @@ enum aarch64_symbol_type enum aarch64_addr_query_type { ADDR_QUERY_M, ADDR_QUERY_LDP_STP, + ADDR_QUERY_STR, ADDR_QUERY_ANY }; diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 3d1f6a01cb7..48d92702723 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -131,9 +131,9 @@ (define_insn "*aarch64_simd_mov" [(set (match_operand:VQ 0 "nonimmediate_operand" - "=w, Umq, m, w, ?r, ?w, ?r, w") + "=w, Umq, Uts, w, ?r, ?w, ?r, w") (match_operand:VQ 1 "general_operand" - "m, Dz, w, w, w, r, r, Dn"))] + "m, Dz,w, w, w, r, r, Dn"))] "TARGET_SIMD && (register_operand (operands[0], mode) || aarch64_simd_reg_or_zero (operands[1], mode))" diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index ea9ead234cb..04baf5b6de6 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -41,4 +41,8 @@ AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW) are not considered cheap. */ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND) +/* Don't use a register offset in a memory address for a quad-word store. */ +AARCH64_EXTRA_TUNING_OPTION ("slow_regoffset_quadword_store", +SLOW_REGOFFSET_QUADWORD_STORE) + #undef AARCH64_EXTRA_TUNING_OPTION diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 0599a79bfeb..d17f3b70271 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -894,7 +894,7 @@ static const struct tune_params qdf24xx_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK,/* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE), /* tune_flags. 
*/ &qdf24xx_prefetch_tune }; @@ -5531,6 +5531,16 @@ aarch64_classify_address (struct aarch64_address_info *info, || vec_flags == VEC_ADVSIMD || vec_flags == VEC_SVE_DATA)); + /* Avoid register indexing for 128-bit stores when the + AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE option is set. */ + if (!optimize_size + && type == ADDR_QUERY_STR + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE) + && (mode == TImode || mode == TFmode + || aarch64_vector_data_mode_p (mode))) +allow_reg_index_p = false; + /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and [Rn, #offset, MUL VL]. */ if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0 diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index edb6a758333..348b867ff7f 100644 --- a/gcc/config/aarch64/aa
Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor
On 01/17/2018 05:37 AM, Wilco Dijkstra wrote: In general I think the best way to achieve this would be to use the existing cost models which are there for exactly this purpose. If this doesn't work well enough then we should fix those. I tried using cost models, and this didn't work, because the costs don't allow us to distinguish between loads and stores. If you mark reg+reg as expensive, you get a performance loss from losing the loads, and a performance gain from losing the stores, and they cancel each other out. this patch disables a whole class of instructions for a specific target rather than simply telling GCC that they are expensive and should only be used if there is no cheaper alternative. This is the only solution I found that worked. Also there is potential impact on generic code from: (define_insn "*aarch64_simd_mov" [(set (match_operand:VQ 0 "nonimmediate_operand" - "=w, Umq, m, w, ?r, ?w, ?r, w") + "=w, Umq, Utf, w, ?r, ?w, ?r, w") (match_operand:VQ 1 "general_operand" - "m, Dz, w, w, w, r, r, Dn"))] + "m, Dz,w, w, w, r, r, Dn"))] It seems an 'm' constraint has special meaning in the register allocator, using a different constraint can block certain simplifications (for example merging stack offsets into load/store in the post-reload cleanup pass), so we'd need to verify this doesn't cause regressions. No optimizer should be checking for 'm'. They should be checking for CT_MEMORY, which indicates a constraint that accepts memory. Utf is properly marked as a memory constraint. I did some testing to verify that the patch would not affect other aarch64 targets at the time, though I don't recall now exactly what I did. Jim
Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor
On Wednesday 17 January 2018 11:13 PM, Wilco Dijkstra wrote: > Are you saying the same issue exists for all stores with writeback? If so then > your patch would need to address that too. Yes, I'll be posting a separate patch for that because the condition set is slightly different for it. It will also be accompanied with a slightly different tuning for addrcost, which is why it needs separate testing. > It seems way more fundamental if it affects anything that isn't a simple > immediate offset. Again I suggest using the existing cost infrastructure > to find a setting that improves performance. If discouraging pre/post > increment > helps Falkor then that's better than doing nothing. The existing costs don't differentiate between loads and stores and that is specifically what I need for falkor. >>> I think a special case for Falkor in aarch64_address_cost would be >>> acceptable >>> in GCC8 - that would be much smaller and cleaner than the current patch. >>> If required we could improve upon this in GCC9 and add a way to >>> differentiate >>> between loads and stores. >> >> I can't do this in address_cost since I can't determine whether the >> address is a load or a store location. The most minimal way seems to be >> using the patterns in the md file. > > Well I don't think the approach of blocking specific patterns is a good > solution to > this problem and may not be accepted by AArch64 maintainers. Try your example > with -funroll-loops and compare with my suggestion (with or without extra > code to > increase cost of writeback too). It seems to me adjusting costs is always > going to > result in better overall code quality, even if it also applies to loads for > the time being. Costs are not useful for this scenario because they cannot differentiate between loads and stores. To make that distinction I have to block specific patterns, unless there's a better way I'm unaware of that helps determine whether a memory reference is a load or a store. 
Another approach I am trying to minimize the change is to add a new ADDR_QUERY_STR for aarch64_legitimate_address_p, which can then be used in classify_address to skip register addressing mode for falkor. That way we avoid the additional hook. It will still need the additional Utf memory constraint though. Do you know of a way I can distinguish between loads and stores in costs tuning? Siddhesh
Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor
Siddhesh Poyarekar wrote: On Wednesday 17 January 2018 08:31 PM, Wilco Dijkstra wrote: > Why is that a bad thing? With the patch as is, the testcase generates: > > .L4: > ldr q0, [x2, x3] > add x5, x1, x3 > add x3, x3, 16 > cmp x3, x4 > str q0, [x5] > bne .L4 > > With a change in address cost (for loads and stores) we would get: > > .L4: > ldr q0, [x3], 16 > str q0, [x4], 16 > cmp x3, x5 > bne .L4 > This is great for the load because of the way the falkor prefetcher > works, but it is terrible for the store because of the way the pipeline > works. The only performant store for falkor is an indirect load with a > constant or zero offset. Everything else has hidden costs. Are you saying the same issue exists for all stores with writeback? If so then your patch would need to address that too. > I briefly looked at the possibility of splitting the register_offset > cost into load and store, but I realized that I'd have to modify the > target hook for it to be useful, which is way too much work for this > single quirk. It seems way more fundamental if it affects anything that isn't a simple immediate offset. Again I suggest using the existing cost infrastructure to find a setting that improves performance. If discouraging pre/post increment helps Falkor then that's better than doing nothing. >> I think a special case for Falkor in aarch64_address_cost would be acceptable >> in GCC8 - that would be much smaller and cleaner than the current patch. >> If required we could improve upon this in GCC9 and add a way to differentiate >> between loads and stores. > > I can't do this in address_cost since I can't determine whether the > address is a load or a store location. The most minimal way seems to be > using the patterns in the md file. Well I don't think the approach of blocking specific patterns is a good solution to this problem and may not be accepted by AArch64 maintainers. 
Try your example with -funroll-loops and compare with my suggestion (with or without extra code to increase cost of writeback too). It seems to me adjusting costs is always going to result in better overall code quality, even if it also applies to loads for the time being. Wilco
Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor
On Wednesday 17 January 2018 08:31 PM, Wilco Dijkstra wrote: > Why is that a bad thing? With the patch as is, the testcase generates: > > .L4: > ldr q0, [x2, x3] > add x5, x1, x3 > add x3, x3, 16 > cmp x3, x4 > str q0, [x5] > bne .L4 > > With a change in address cost (for loads and stores) we would get: > > .L4: > ldr q0, [x3], 16 > str q0, [x4], 16 > cmp x3, x5 > bne .L4 > > This looks better to me, especially if there are more loads and stores and > some have offsets as well (the writeback is once per stream while the extra > add happens for every store). It may be worth trying both possibilities > on a large body of code and see which comes out smallest/fastest. This is great for the load because of the way the falkor prefetcher works, but it is terrible for the store because of the way the pipeline works. The only performant store for falkor is an indirect load with a constant or zero offset. Everything else has hidden costs. > Note using the cost model as intended means the compiler tries to use the > lowest cost possibility rather than never emitting the instruction, not even > when optimizing for size. I think it's wrong to always block a valid > instruction. > It's not clear whether it is easy to split out the costs today (it could be > done > in aarch64_rtx_costs but not aarch64_address_cost, and the latter is what > IVOpt uses). I briefly looked at the possibility of splitting the register_offset cost into load and store, but I realized that I'd have to modify the target hook for it to be useful, which is way too much work for this single quirk. >> Further, it seems like worthwhile work only if there are other parts >> that actually have the same quirk and can use this split. Do you know >> of any such cores? > > Currently there are several supported CPUs which use a much higher cost > for TImode and for register offsets. So it's a common thing to want, however > I don't know whether splitting load/store address costs helps for those. It wouldn't. 
This ought to be expressed already using the addr_scale_costs. > I think a special case for Falkor in aarch64_address_cost would be acceptable > in GCC8 - that would be much smaller and cleaner than the current patch. > If required we could improve upon this in GCC9 and add a way to differentiate > between loads and stores. I can't do this in address_cost since I can't determine whether the address is a load or a store location. The most minimal way seems to be using the patterns in the md file. Siddhesh
Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor
Siddhesh Poyarekar wrote: > The current cost model will disable reg offset for loads as well as > stores, which doesn't work well since loads with reg offset are faster > for falkor. Why is that a bad thing? With the patch as is, the testcase generates: .L4: ldr q0, [x2, x3] add x5, x1, x3 add x3, x3, 16 cmp x3, x4 str q0, [x5] bne .L4 With a change in address cost (for loads and stores) we would get: .L4: ldr q0, [x3], 16 str q0, [x4], 16 cmp x3, x5 bne .L4 This looks better to me, especially if there are more loads and stores and some have offsets as well (the writeback is once per stream while the extra add happens for every store). It may be worth trying both possibilities on a large body of code and see which comes out smallest/fastest. Note using the cost model as intended means the compiler tries to use the lowest cost possibility rather than never emitting the instruction, not even when optimizing for size. I think it's wrong to always block a valid instruction. > Also, this is a very specific tweak for a specific processor, i.e. I > don't know if there is value in splitting out the costs into loads and > stores and further into 128-bit and lower just to set the 128 store cost > higher. That will increase the size of the change by quite a bit and > may not make it suitable for inclusion into gcc8 at this stage, while > the current one still qualifies given its contained impact. It's not clear whether it is easy to split out the costs today (it could be done in aarch64_rtx_costs but not aarch64_address_cost, and the latter is what IVOpt uses). > Further, it seems like worthwhile work only if there are other parts > that actually have the same quirk and can use this split. Do you know > of any such cores? Currently there are several supported CPUs which use a much higher cost for TImode and for register offsets. So it's a common thing to want, however I don't know whether splitting load/store address costs helps for those. 
I think a special case for Falkor in aarch64_address_cost would be acceptable in GCC8 - that would be much smaller and cleaner than the current patch. If required we could improve upon this in GCC9 and add a way to differentiate between loads and stores. Wilco
Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor
On Wednesday 17 January 2018 07:07 PM, Wilco Dijkstra wrote: > (finished version this time, somehow Outlook loves to send emails early...) > > Hi, > > In general I think the best way to achieve this would be to use the > existing cost models which are there for exactly this purpose. If > this doesn't work well enough then we should fix those. As is, > this patch disables a whole class of instructions for a specific > target rather than simply telling GCC that they are expensive and > should only be used if there is no cheaper alternative. The current cost model will disable reg offset for loads as well as stores, which doesn't work well since loads with reg offset are faster for falkor. Also, this is a very specific tweak for a specific processor, i.e. I don't know if there is value in splitting out the costs into loads and stores and further into 128-bit and lower just to set the 128 store cost higher. That will increase the size of the change by quite a bit and may not make it suitable for inclusion into gcc8 at this stage, while the current one still qualifies given its contained impact. Further, it seems like worthwhile work only if there are other parts that actually have the same quirk and can use this split. Do you know of any such cores? > Also there is potential impact on generic code from: > > (define_insn "*aarch64_simd_mov" >[(set (match_operand:VQ 0 "nonimmediate_operand" > - "=w, Umq, m, w, ?r, ?w, ?r, w") > + "=w, Umq, Utf, w, ?r, ?w, ?r, w") > (match_operand:VQ 1 "general_operand" > - "m, Dz, w, w, w, r, r, Dn"))] > + "m, Dz,w, w, w, r, r, Dn"))] > > It seems an 'm' constraint has special meaning in the register allocator, > using a different constraint can block certain simplifications (for example > merging stack offsets into load/store in the post-reload cleanup pass), > so we'd need to verify this doesn't cause regressions. I'll verify this. 
> Also it is best to introduce generic interfaces: > > +/* Return TRUE if OP is a good address mode for movti target on falkor. */ > +bool > +aarch64_falkor_movti_target_operand_p (rtx op) > > +(define_memory_constraint "Utf" > + "@internal > + A good address for a falkor movti target operand." > + (and (match_code "mem") > + (match_test "aarch64_falkor_movti_target_operand_p (op)"))) > > We should use generic names here even if the current implementation > wants to do something specific for Falkor. I'll fix this. Thanks, Siddhesh
Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor
(finished version this time, somehow Outlook loves to send emails early...) Hi, In general I think the best way to achieve this would be to use the existing cost models which are there for exactly this purpose. If this doesn't work well enough then we should fix those. As is, this patch disables a whole class of instructions for a specific target rather than simply telling GCC that they are expensive and should only be used if there is no cheaper alternative. Also there is potential impact on generic code from: (define_insn "*aarch64_simd_mov" [(set (match_operand:VQ 0 "nonimmediate_operand" - "=w, Umq, m, w, ?r, ?w, ?r, w") + "=w, Umq, Utf, w, ?r, ?w, ?r, w") (match_operand:VQ 1 "general_operand" - "m, Dz, w, w, w, r, r, Dn"))] + "m, Dz,w, w, w, r, r, Dn"))] It seems an 'm' constraint has special meaning in the register allocator, using a different constraint can block certain simplifications (for example merging stack offsets into load/store in the post-reload cleanup pass), so we'd need to verify this doesn't cause regressions. Also it is best to introduce generic interfaces: +/* Return TRUE if OP is a good address mode for movti target on falkor. */ +bool +aarch64_falkor_movti_target_operand_p (rtx op) +(define_memory_constraint "Utf" + "@internal + A good address for a falkor movti target operand." + (and (match_code "mem") + (match_test "aarch64_falkor_movti_target_operand_p (op)"))) We should use generic names here even if the current implementation wants to do something specific for Falkor. Wilco
Re: [PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor
Hi, In general I think the best way to achieve this would be to use the existing cost models which are there for exactly this purpose. If this doesn't work well enough then we should fix those. As is, this patch disables a whole class of instructions for a specific target rather than simply telling GCC that they are expensive and should only be used if there is no cheaper alternative. Also there is impact on generic code from:
[PING][PATCH, AArch64] Disable reg offset in quad-word store for Falkor
From: Siddhesh Poyarekar Hi, Jim Wilson posted a patch for this in September[1] and it appears following discussions that the patch was an acceptable fix for falkor. Kugan followed up[2] with a test case since that was requested during initial review. Jim has moved on from Linaro, so I'm pinging this patch with the hope that it is OK for inclusion since it was posted before the freeze and is also isolated in impact to just falkor. Siddhesh [1] https://gcc.gnu.org/ml/gcc-patches/2017-09/msg01547.html [2] https://gcc.gnu.org/ml/gcc-patches/2017-11/msg00050.html On Falkor, because of an idiosyncrasy of how the pipelines are designed, a quad-word store using a reg+reg addressing mode is almost twice as slow as an add followed by a quad-word store with a single reg addressing mode. So we get better performance if we disallow addressing modes using register offsets with quad-word stores. Using lmbench compiled with -O2 -ftree-vectorize as my benchmark, I see a 13% performance increase on stream copy using this patch, and a 16% performance increase on stream scale using this patch. I also see a small performance increase on SPEC CPU2006 of around 0.2% for int and 0.4% for FP at -O3. 2018-01-17 Jim Wilson Kugan Vivekanandarajah gcc/ * config/aarch64/aarch64-protos.h (aarch64_falkor_movti_target_operand_p): Declare. * config/aarch64/aarch64.c (aarch64_falkor_movti_target_operand_p): New. * config/aarch64/constraints.md (Utf): New. * config/aarch64/aarch64.md (movti_aarch64): Use Utf constraint instead of m. (movtf_aarch64): Likewise. * config/aarch64/aarch64-simd.md (aarch64_simd_mov): Use Utf constraint instead of m. gcc/testsuite/ * gcc/testsuite/gcc.target/aarch64/pr82533.c: New test case. 
--- gcc/config/aarch64/aarch64-protos.h| 1 + gcc/config/aarch64/aarch64-simd.md | 4 ++-- gcc/config/aarch64/aarch64.c | 10 ++ gcc/config/aarch64/aarch64.md | 6 +++--- gcc/config/aarch64/constraints.md | 6 ++ gcc/testsuite/gcc.target/aarch64/pr82533.c | 11 +++ 6 files changed, 33 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/pr82533.c diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 2d705d2..088d864 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -433,6 +433,7 @@ bool aarch64_simd_mem_operand_p (rtx); bool aarch64_sve_ld1r_operand_p (rtx); bool aarch64_sve_ldr_operand_p (rtx); bool aarch64_sve_struct_memory_operand_p (rtx); +bool aarch64_falkor_movti_target_operand_p (rtx); rtx aarch64_simd_vect_par_cnst_half (machine_mode, int, bool); rtx aarch64_tls_get_addr (void); tree aarch64_fold_builtin (tree, int, tree *, bool); diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 3d1f6a0..f7daac3 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -131,9 +131,9 @@ (define_insn "*aarch64_simd_mov" [(set (match_operand:VQ 0 "nonimmediate_operand" - "=w, Umq, m, w, ?r, ?w, ?r, w") + "=w, Umq, Utf, w, ?r, ?w, ?r, w") (match_operand:VQ 1 "general_operand" - "m, Dz, w, w, w, r, r, Dn"))] + "m, Dz,w, w, w, r, r, Dn"))] "TARGET_SIMD && (register_operand (operands[0], mode) || aarch64_simd_reg_or_zero (operands[1], mode))" diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 2e70f3a..0db7a4f 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -13477,6 +13477,16 @@ aarch64_sve_struct_memory_operand_p (rtx op) && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last)); } +/* Return TRUE if OP is a good address mode for movti target on falkor. 
*/ +bool +aarch64_falkor_movti_target_operand_p (rtx op) +{ + if ((enum attr_tune) aarch64_tune == TUNE_FALKOR) +return MEM_P (op) && ! (GET_CODE (XEXP (op, 0)) == PLUS + && ! CONST_INT_P (XEXP (XEXP (op, 0), 1))); + return MEM_P (op); +} + /* Emit a register copy from operand to operand, taking care not to early-clobber source registers in the process. diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index edb6a75..696fd12 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1079,7 +1079,7 @@ (define_insn "*movti_aarch64" [(set (match_operand:TI 0 -"nonimmediate_operand" "=r, w,r,w,r,m,m,w,m") +"nonimmediate_operand" "=r, w,r,w,r,m,m,w,Utf") (match_operand:TI 1 "aarch64_movti_operand" " rn,r,w,w,m,r,Z,m,w"))] "(register_operand (operands[0], TImode) @@ -1226,9 +1226,9 @@ (define_insn "*movtf_aarch64" [(set (match_operand:TF 0 -"nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,m,?r,m ,m") +