Re: [PATCH v2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-16 Thread luoxhu via Gcc-patches



On 2020/9/15 14:51, Richard Biener wrote:


>> I only see VAR_DECL and PARM_DECL, is there any function to check the tree
>> variable is global? I added DECL_REGISTER, but the RTL still expands to 
>> stack:
> 
> is_global_var () or alternatively !auto_var_in_fn_p (), I think doing
> IFN_SET only
> makes sense if there's the chance we can promote the variable to a
> register.  But it
> would be an incorrect transform (it stores the whole vector) if the
> vector storage
> could "escape" to another thread - which means you probably have to check
> !TREE_ADDRESSABLE as well.
> 
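[Editorial note] For reference, a minimal sketch of how the checks Richard describes would combine in gimple-isel (illustrative only; view_op0 stands for the vector decl being indexed, and the exact placement is an assumption, not the committed code):

  /* Only emit .VEC_SET when the vector has a chance to live in a register:
     it must be an automatic variable of this function and its address must
     not have been taken (otherwise the full-vector store could introduce a
     store data race or be observed through the escaped address).  */
  if (!is_global_var (view_op0)
      && auto_var_in_fn_p (view_op0, current_function_decl)
      && !TREE_ADDRESSABLE (view_op0))
    {
      /* ... build the IFN_VEC_SET call here ...  */
    }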

The tree of the parameter "u" will be marked ADDRESSABLE when generating
"VIEW_CONVERT_EXPR<int[4]>(D.3190)[_1] = i;", so if we check !TREE_ADDRESSABLE, no IFN_SET
will be produced in gimple-isel.


#1  0x1066c700 in convert_vector_to_array_for_subscript (loc=5307072, vecp=0x7fffc5d0, index=) at ../../gcc/gcc/c-family/c-common.c:8169
#2  0x10553b54 in build_array_ref (loc=5307072, array=, index=<trunc_mod_expr 0x759c73a0>) at ../../gcc/gcc/c/c-typeck.c:2668
#3  0x105c8824 in c_parser_postfix_expression_after_primary (parser=0x77f703f0, expr_loc=5307040, expr=...) at ../../gcc/gcc/c/c-parser.c:10494
#4  0x105c7570 in c_parser_postfix_expression (parser=0x77f703f0) at ../../gcc/gcc/c/c-parser.c:10216

>>
>> My current implementation does:
>>
>> 1)  v = vec_insert (i, u, n);
>>
>> =>gimple:
>> {
>>register __vector signed int D.3190;
>>D.3190 = u;// *new decl and copy u first.*
>>_1 = n & 3;
>>VIEW_CONVERT_EXPR<int[4]>(D.3190)[_1] = i;   // *update op0 of VIEW_CONVERT_EXPR*
>>_2 = D.3190;
>>...
>> }
>>
>> =>isel:
>> {
>>register __vector signed int D.3190;
>>D.3190 = u_4(D);
>>_1 = n_6(D) & 3;
>>.VEC_SET (&D.3190, i_7(D), _1);
> 
> why are you passing the address of D.3190 to .VEC_SET?  That will not
> make D.3190
> be expanded to a pseudo.   You really need to have GIMPLE registers
> here (SSA name)
> and thus a return value, leaving the argument unmodified
> 
>D.3190_3 = .VEC_SET (D.3190_4, i_7(D), _1);
> 
> note this is why I asked about the actual CPU instruction - as I read
> Segher's mail
> the instruction modifies a vector register, not memory.
> 

Updated the code and got the expected gimple-isel output and ASM for both cases:

pr79251.c.236t.isel:

__attribute__((noinline))
test (__vector signed int u, int i, size_t n)
{
  long unsigned int _1;
  __vector signed int _6;
  vector(4) int _7;
  vector(4) int vec_set_dst_8;

  <bb 2> [local count: 1073741824]:
  _1 = n_2(D) & 3;
  _7 = u;
  vec_set_dst_8 = .VEC_SET (_7, i_4(D), _1);
  u = vec_set_dst_8;
  _6 = u;
  return _6;

}

But the tree variable "u" needs "TREE_ADDRESSABLE (view_op0) = 0;" after generating the
IFN VEC_SET (maybe checking for IFN VEC_SET and clearing the flag in
discover_nonconstant_array_refs would be better later); otherwise "u" will still
be expanded to the stack, since the expander reports:
 "Replacing Expressions:  _7 replace with --> _7 = u;".

Setting "u" to non-addressable also seems really unreasonable as for below
case, as u[n+1] need be ADDRESSABLE:

__attribute__ ((noinline)) vector TYPE
test (vector TYPE u, TYPE i, size_t n)
{
 u[n % 4] = i;
 u[n+1] = i+1;
 return u;
}

=> gimple-isel with both VEC_SET and VIEW_CONVERT_EXPR:

test (__vector signed int u, int i, size_t n)
{
  long unsigned int _1;
  long unsigned int _2;
  int _3;
  __vector signed int _9;
  vector(4) int _10;
  vector(4) int vec_set_dst_11;

  <bb 2> [local count: 1073741824]:
  _1 = n_4(D) & 3;
  _10 = u;
  vec_set_dst_11 = .VEC_SET (_10, i_6(D), _1);
  u = vec_set_dst_11;
  _2 = n_4(D) + 1;
  _3 = i_6(D) + 1;
  VIEW_CONVERT_EXPR<int[4]>(u)[_2] = _3;
  _9 = u;
  return _9;

}


The code below generates the IFN call; create_tmp_reg or create_tmp_reg_ssa_name
does not seem to create a variable that will be allocated to a register?


diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc
index b330cf4c20e..a699022cd09 100644
--- a/gcc/gimple-isel.cc
+++ b/gcc/gimple-isel.cc
...
+  if (!is_global_var (view_op0)
+ && TREE_CODE (TREE_TYPE (view_op0)) == VECTOR_TYPE
+ && tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (view_op0)))
+ && tree_to_uhwi (TYPE_SIZE (TREE_TYPE (view_op0))) == 128
+ && determine_value_range (pos, &minv, &maxv) == VR_RANGE
+ && wi::geu_p (minv, 0)
+ && wi::leu_p (maxv, (128 / GET_MODE_BITSIZE (innermode))))
+   {
+ location_t loc = gimple_location (stmt);
+ tree var_src = create_tmp_reg (TREE_TYPE (view_op0));
+ tree var_dst
+   = make_temp_ssa_name (TREE_TYPE (view_op0), NULL, "vec_set_dst");
+ TREE_ADDRESSABLE (view_op0) = 0;
+
+ ass_stmt = gimple_build_assign (var_src, view_op0);
+ gimple_set_location (ass_stmt, loc);
+ gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
+
+ new_stmt
+   = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos);
+
+ gimple_call_set_lhs (new_stmt, var_dst);
+
+ 
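[Editorial note] For comparison, a minimal sketch of the all-SSA shape Richard asks for, where .VEC_SET takes the vector by value and returns the updated vector (illustrative only; names and the surrounding gimple-isel context are assumptions, not the committed code):

  location_t loc = gimple_location (stmt);
  tree vectype = TREE_TYPE (view_op0);

  /* Read the current vector value into an SSA name.  */
  tree src = make_ssa_name (vectype);
  gimple *load = gimple_build_assign (src, view_op0);
  gimple_set_location (load, loc);
  gsi_insert_before (gsi, load, GSI_SAME_STMT);

  /* D.3190_3 = .VEC_SET (D.3190_4, i_7(D), _1);  */
  tree dst = make_ssa_name (vectype);
  gcall *call = gimple_build_call_internal (IFN_VEC_SET, 3, src, val, pos);
  gimple_call_set_lhs (call, dst);
  gimple_set_location (call, loc);
  gsi_insert_before (gsi, call, GSI_SAME_STMT);

  /* Write the result back, replacing the original VIEW_CONVERT_EXPR store.  */
  gimple *store = gimple_build_assign (unshare_expr (view_op0), dst);
  gimple_set_location (store, loc);
  gsi_replace (gsi, store, true);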

Re: [PATCH v2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-14 Thread luoxhu via Gcc-patches



On 2020/9/14 17:47, Richard Biener wrote:

On Mon, Sep 14, 2020 at 10:05 AM luoxhu  wrote:



Not sure whether this reflects the issues you discussed above.
I constructed the test cases below and tested with and without this patch;
only for "a+c" (which means store only) does the performance get worse with
this patch, due to the extra load/store (about 50% slower).
For the "v = vec_insert (i, v, n);" usage, v is always loaded after the store, so this
patch always gives a big benefit.
But for the "v[n % 4] = i;" usage, it depends on whether "v" is used soon afterwards,
inside or outside the function.  Does this mean that unifying the two
usages to the same gimple code is sometimes not a good idea?  Or is it possible to
detect whether the destination "v" of the generated IFN ".VEC_INSERT (&v, i_4(D), _1);" is
used nearby, inside or outside the function?


#define TYPE int

vector TYPE v = {1, 2, 3, 4};   // a. global vector.
vector TYPE s = {4, 3, 2, 1};

__attribute__ ((noinline))
vector TYPE
test (vector TYPE u, TYPE i, size_t n)
{
   v[n % 4] = i;


^^^

this should be

u[n % 4] = i;

I guess.  Is the % 4 mandated by the vec_insert semantics btw?


Yes.  Segher pasted the builtin description in his reply.  "v = vec_insert (i, u, n);"
is a bit different from "u[n % 4] = i;" since it returns a copy of u instead of
modifying u.  The adjustment when lowering __builtin_vec_insert_xxx to gimple makes a
copy of u first to meet that requirement.
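[Editorial note] A small illustration of that difference (TYPE as in the tests elsewhere in this thread; sketch only, not code from the patch):

  /* vec_insert returns an updated copy; the original u is unchanged.  */
  vector TYPE
  via_builtin (vector TYPE u, TYPE i, size_t n)
  {
    vector TYPE v = vec_insert (i, u, n);
    return v;
  }

  /* The subscript form modifies u in place.  */
  vector TYPE
  via_subscript (vector TYPE u, TYPE i, size_t n)
  {
    u[n % 4] = i;
    return u;
  }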



If you tested with the above error you probably need to re-measure.


No, I did test with u as a local instead of a global before.  If u is not used very
soon, the performance is almost the same whether we generate a single store or an IFN_SET
insert with a variable index.

source:
__attribute__ ((noinline)) vector TYPE
test (vector TYPE u, TYPE i, size_t n)
{
 u[n % 4] = i;
 vector TYPE r = {0};
 for (long k = 0; k < 100; k++)
   {
 r += v;
   }
 return u+r;
}

=> the store-hit-load is relieved due to the long distance.

ASM:
0:  addis 2,12,.TOC.-.LCF0@ha
   addi 2,2,.TOC.-.LCF0@l
   .localentry test,.-test
   addis 10,2,.LANCHOR0@toc@ha
   li 8,50
   xxspltib 32,0
   addi 9,1,-16
   rldic 6,6,2,60
   stxv 34,-16(1)
   addi 10,10,.LANCHOR0@toc@l
   mtctr 8
   xxlor 33,32,32
   stwx 5,9,6  // short store
   lxv 45,0(10)
   .p2align 4,,15
.L2:
   vadduwm 0,0,13
   vadduwm 1,1,13
   bdnz .L2
   vadduwm 0,0,1
   lxv 33,-16(1)   // wide load
   vadduwm 2,0,1
   blr


Then I intended to use the global "v" there for the global-memory insert test, to separate
the store and the load into different functions ("v" is stored in the function
but loaded outside of it; a single store versus lxv+xxperm+xxsel+stxv gives different
performance, and the inner function does not know the distance
between the store and the load across functions).

Do you mean that we don't need to generate IFN_SET if "v" is global memory?
I only see VAR_DECL and PARM_DECL; is there any function to check whether a tree
variable is global?  I added DECL_REGISTER, but the RTL still expands to the stack:

gcc/internal-fn.c: rtx to_rtx = expand_expr (view_op0, NULL_RTX, VOIDmode, 
EXPAND_WRITE);

(gdb) p view_op0
$584 = 
(gdb) p DECL_REGISTER(view_op0)
$585 = 1
(gdb) p to_rtx
$586 = (rtx_def *) (mem/c:V4SI (reg/f:DI 112 virtual-stack-vars) [1 D.3190+0 S16 A128])




On gimple the above function (after fixing it) looks like

   VIEW_CONVERT_EXPR<int[4]>(u)[_1] = i_4(D);

and the IFN idea I had would - for non-global memory 'u' only - transform
this to

   vector_register_2 = u;
   vector_register_3 = .IFN_VEC_SET (vector_register_2, _1, i_4(D));
   u = vector_register_3;

if vec_set can handle variable indexes.  This then becomes a
vec_set on a register and if that was the only variable indexing of 'u'
will also cause 'u' to be expanded as register rather than stack memory.

Note we can't use the direct-optab method here since the vec_set optab
modifies operand 0 which isn't possible in SSA form.  That might hint
at that we eventually want to extend BIT_INSERT_EXPR to handle
a non-constant bit position but for experiments using an alternate
internal function is certainly easier.
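[Editorial note] For context, an illustrative gimple form (not taken from this thread) of a constant-index insert, which BIT_INSERT_EXPR can already express because the bit position is a compile-time constant, e.g. u[2] = i on a V4SI vector:

  u_5 = BIT_INSERT_EXPR <u_4(D), i_3(D), 64>;

There is no equivalent form for a variable bit position today, which is why the discussion turns to a separate internal function.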



My current implementation does:

1)  v = vec_insert (i, u, n);

=>gimple:
{
 register __vector signed int D.3190;
 D.3190 = u;// *new decl and copy u first.*
 _1 = n & 3;
 VIEW_CONVERT_EXPR<int[4]>(D.3190)[_1] = i;   // *update op0 of VIEW_CONVERT_EXPR*
 _2 = D.3190;
 ...
}

=>isel:
{
 register __vector signed int D.3190;
 D.3190 = u_4(D);
 _1 = n_6(D) & 3;
 .VEC_SET (&D.3190, i_7(D), _1);
 _2 = D.3190;
 ...
}


2) u[n % 4] = i;

=>gimple:
{
 __vector signed int D.3191;
 _1 = n & 3;
 VIEW_CONVERT_EXPR<int[4]>(u)[_1] = i;   // *update op0 of VIEW_CONVERT_EXPR*
 D.3191 = u;   
...

}

=>isel:
{
 D.3190 = u_4(D);
 _1 = n_6(D) & 3;
 .VEC_SET (&D.3190, i_7(D), _1);
 _2 = D.3190;
 v = _2;
}

The IFN ".VEC_SET" behavior quite like the other IFN STORE functions and doesn't
require a dest operand to be set?  Both 1) and 2) are modifying operand 0 of
VIEW_CONVERT_EXPR just 

Re: [PATCH v2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-14 Thread luoxhu via Gcc-patches



On 2020/9/10 18:08, Richard Biener wrote:
> On Wed, Sep 9, 2020 at 6:03 PM Segher Boessenkool
>  wrote:
>>
>> On Wed, Sep 09, 2020 at 04:28:19PM +0200, Richard Biener wrote:
>>> On Wed, Sep 9, 2020 at 3:49 PM Segher Boessenkool
>>>  wrote:

 Hi!

 On Tue, Sep 08, 2020 at 10:26:51AM +0200, Richard Biener wrote:
> Hmm, yeah - I guess that's what should be addressed first then.
> I'm quite sure that in case 'v' is not on the stack but in memory like
> in my case a SImode store is better than what we get from
> vec_insert - in fact vec_insert will likely introduce a RMW cycle
> which is prone to inserting store-data-races?

 The other way around -- if it is in memory, and was stored as vector
 recently, then reading back something shorter from it is prone to
 SHL/LHS problems.  There is nothing special about the stack here, except
 of course it is more likely to have been stored recently if on the
 stack.  So it depends how often it has been stored recently which option
 is best.  On newer CPUs, although they can avoid SHL/LHS flushes more
 often, the penalty is relatively bigger, so memory does not often win.

 I.e.: it needs to be measured.  Intuition is often wrong here.
>>>
>>> But the current method would simply do a direct store to memory
>>> without a preceding read of the whole vector.
>>
>> The problem is even worse the other way: you do a short store here, but
>> so a full vector read later.  If the store and read are far apart, that
>> is fine, but if they are close (that is on the order of fifty or more
>> insns), there can be problems.
> 
> Sure, but you can't simply load/store a whole vector when the code didn't
> unless you know it will not introduce data races and it will not trap (thus
> the whole vector needs to be at least naturally aligned).
> 
> Also if there's a preceding short store you will now load the whole vector
> to avoid the short store ... catch-22
> 
>> There often are problems over function calls (where the compiler cannot
>> usually *see* how something is used).
> 
> Yep.  The best way would be to use small loads and larger stores
> which is what CPUs usually tend to handle fine (with alignment
> constraints, etc.).  Of course that's not what either of the "solutions"
> can do.
> 
> That said, since you seem to be "first" in having an instruction
> to insert into a vector at a variable position the idea that we'd
> have to spill anyway for this to be expanded and thus we expand
> the vector to a stack location in the first place falls down.  And
> that's where I'd first try to improve things.
> 
> So what can the CPU actually do?
> 

Not sure whether this reflects the issues you discussed above.
I constructed the test cases below and tested with and without this patch;
only for "a+c" (which means store only) does the performance get worse with
this patch, due to the extra load/store (about 50% slower).
For the "v = vec_insert (i, v, n);" usage, v is always loaded after the store, so this
patch always gives a big benefit.
But for the "v[n % 4] = i;" usage, it depends on whether "v" is used soon afterwards,
inside or outside the function.  Does this mean that unifying the two
usages to the same gimple code is sometimes not a good idea?  Or is it possible to
detect whether the destination "v" of the generated IFN ".VEC_INSERT (&v, i_4(D), _1);" is
used nearby, inside or outside the function?


#define TYPE int

vector TYPE v = {1, 2, 3, 4};   // a. global vector.
vector TYPE s = {4, 3, 2, 1};

__attribute__ ((noinline))
vector TYPE
test (vector TYPE u, TYPE i, size_t n)
{
  v[n % 4] = i;
  return u;
}

int main ()
{
  //vector TYPE v = {1, 2, 3, 4};// b. local vector.
  //vector TYPE s = {4, 3, 2, 1};
  vector TYPE r = {0};
  for (long k = 0; k < 3989478945; k++)
{
r += test (s, 254.0f, k);  // c. access different vector. store only.
   //r += test (v, 254.0f, k);  // d. access same vector. load after store in callee function.
  //test (s, 254.0f, k); r += v;   //e. load after store out of function.
}
  return r[0];
}


Thanks,
Xionghu



Re: [PATCH v2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-08 Thread luoxhu via Gcc-patches



On 2020/9/8 16:26, Richard Biener wrote:
>> Seems not only pseudo, for example "v = vec_insert (i, v, n);"
>> the vector variable will be store to stack first, then [r112:DI] is a
>> memory here to be processed.  So the patch loads it from stack(insn #10) to
>> temp vector register first, and store to stack again(insn #24) after
>> rs6000_vector_set_var.
> Hmm, yeah - I guess that's what should be addressed first then.
> I'm quite sure that in case 'v' is not on the stack but in memory like
> in my case a SImode store is better than what we get from
> vec_insert - in fact vec_insert will likely introduce a RMW cycle
> which is prone to inserting store-data-races?

Yes, for your case there is no stack operation and to_rtx is expanded
with BLKmode instead of V4SImode.  Adding a check on the to_rtx mode could work around
it.  The ASM doesn't show a store-hit-load issue.
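[Editorial note] A minimal sketch of such a "to_rtx mode check" (illustrative only; it assumes the draft expand_assignment hook quoted later in the thread, where view_op0 is the vector decl behind the VIEW_CONVERT_EXPR):

  /* Only take the vec_insert path when the expanded destination really is
     the whole vector in its own vector mode; the struct-member case above
     shows up as a BLKmode MEM and must fall through to the normal path.  */
  machine_mode vecmode = TYPE_MODE (TREE_TYPE (view_op0));
  if (MEM_P (to_rtx) && GET_MODE (to_rtx) == vecmode)
    {
      /* ... expand via rs6000_expand_vector_insert / the vec_insert optab ...  */
    }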

optimized:

_1 = i_2(D) % 4;
VIEW_CONVERT_EXPR<int[4]>(x.u)[_1] = a_4(D);

expand:
2: r118:DI=%3:DI
3: r119:DI=%4:DI
4: NOTE_INSN_FUNCTION_BEG
7: r120:DI=unspec[`*.LANCHOR0',%2:DI] 47
  REG_EQUAL `*.LANCHOR0'
8: r122:SI=r118:DI#0
9: {r124:SI=r122:SI/0x4;clobber ca:SI;}
   10: r125:SI=r124:SI<<0x2
   11: r123:SI=r122:SI-r125:SI
  REG_EQUAL r122:SI%0x4
   12: r126:DI=sign_extend(r123:SI)
   13: r127:DI=r126:DI+0x4
   14: r128:DI=r127:DI<<0x2
   15: r129:DI=r120:DI+r128:DI
   16: [r129:DI]=r119:DI#0

 p to_rtx
$319 = (rtx_def *) (mem/c:BLK (reg/f:DI 120) [2 x+0 S32 A128])

asm:
addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
.localentry test,.-test
srawi 9,3,2
addze 9,9
addis 10,2,.LANCHOR0@toc@ha
addi 10,10,.LANCHOR0@toc@l
slwi 9,9,2
subf 9,9,3
extsw 9,9
addi 9,9,4
sldi 9,9,2
stwx 4,10,9
blr


> 
> So - what we need to "fix" is cfgexpand.c marking variably-indexed
> decls as not to be expanded as registers (see
> discover_nonconstant_array_refs).
> 
> I guess one way forward would be to perform instruction
> selection on GIMPLE here and transform
> 
> VIEW_CONVERT_EXPR<int[4]>(D.3185)[_1] = i_6(D)
> 
> to a (direct) internal function based on the vec_set optab.  

I don't quite understand what you mean here.  Do you mean:
ALTIVEC_BUILTIN_VEC_INSERT -> VIEW_CONVERT_EXPR -> internal function -> vec_set
or ALTIVEC_BUILTIN_VEC_INSERT -> internal function -> vec_set?
And in which pass should the selection and transform be placed?
Why call it *based on* the vec_set optab?  The VIEW_CONVERT_EXPR or internal function
is expanded to the vec_set optab.

I guess you suggest adding an internal function for the VIEW_CONVERT_EXPR in gimple,
and doing the transform from the internal function to the vec_set optab in the expander?
I doubt my understanding, as this looks overly complicated, since so far we
transform from the VIEW_CONVERT_EXPR to the vec_set optab directly...
IIUC, an internal function doesn't seem to help much here, as Segher said before.


> But then in GIMPLE D.3185 is also still memory (we don't have a variable
> index partial register set operation - BIT_INSERT_EXPR is
> currently specified to receive a constant bit position only).
> 
> At which point after your patch is the stack storage elided?
> 

Stack storage is elided by register reload pass in RTL.


Thanks,
Xionghu


Re: [PATCH v2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-08 Thread luoxhu via Gcc-patches
Hi Richi,

On 2020/9/7 19:57, Richard Biener wrote:
> +  if (TREE_CODE (to) == ARRAY_REF)
> +   {
> + tree op0 = TREE_OPERAND (to, 0);
> + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR
> + && expand_view_convert_to_vec_set (to, from, to_rtx))
> +   {
> + pop_temp_slots ();
> + return;
> +   }
> +   }
> 
> you're placing this at an awkward spot IMHO, after to_rtx expansion
> but disregading parts of  it and compensating just with 'to' matching.
> Is the pieces (offset, bitpos) really too awkward to work with for
> matching?
> 
> Because as written you'll miscompile
> 
> struct X { _vector signed int v; _vector signed int u; } x;
> 
> test(int i, int a)
> {
>x.u[i] = a;
> }
> 
> as I think you'll end up assigning to x.v.

Thanks for pointing that out, this case will be a problem for the patch.
I checked with optimize_bitfield_assignment_op; it returns very early
because mode1 is not VOIDmode, and this is actually not a "FIELD op= VAL"
operation?

To be honest, I am not quite familiar with this part of the code.  I put the new
function expand_view_convert_to_vec_set just after the to_rtx expansion because
adjust_address will change the V4SImode memory to SImode memory, but I need to
keep the target to_rtx in V4SImode to save the vector after calling
rs6000_vector_set_var, so it seems paradoxical here?

 p to_rtx
$264 = (rtx_def *) (mem/c:V4SI (reg/f:DI 112 virtual-stack-vars) [1 D.3186+0 S16 A128])

=> to_rtx = adjust_address (to_rtx, mode1, 0);

p to_rtx
$265 = (rtx_def *) (mem/c:SI (reg/f:DI 112 virtual-stack-vars) [1 D.3186+0 S4 A128])


> 
> Are we just interested in the case were we store to a
> pseudo or also when the destination is memory?  I guess
> only when it's a pseudo - correct?  In that case
> handling this all in optimize_bitfield_assignment_op
> is probably the best thing to try.
> 
> Note we possibly refrain from assigning a pseudo to
> such vector because we see a variable array-ref to it.

It seems to be not only for a pseudo; for example, with "v = vec_insert (i, v, n);"
the vector variable will be stored to the stack first, so [r112:DI] is a
memory here to be processed.  The patch therefore loads it from the stack (insn #10) into a
temporary vector register first, and stores it back to the stack (insn #24) after
rs6000_vector_set_var.

optimized:

D.3185 = v_3(D);
_1 = n_5(D) & 3;
VIEW_CONVERT_EXPR<int[4]>(D.3185)[_1] = i_6(D);
v_8 = D.3185;
return v_8;

=> expand without the patch:

2: r119:V4SI=%2:V4SI
3: r120:DI=%5:DI
4: r121:DI=%6:DI
5: NOTE_INSN_FUNCTION_BEG
8: [r112:DI]=r119:V4SI

9: r122:DI=r121:DI&0x3
   10: r123:DI=r122:DI<<0x2
   11: r124:DI=r112:DI+r123:DI
   12: [r124:DI]=r120:DI#0

   13: r126:V4SI=[r112:DI]
   14: r118:V4SI=r126:V4SI
   18: %2:V4SI=r118:V4SI
   19: use %2:V4SI

=> expand with the patch (replace #9~#12 to #10~#24):

2: r119:V4SI=%2:V4SI
3: r120:DI=%5:DI
4: r121:DI=%6:DI
5: NOTE_INSN_FUNCTION_BEG
8: [r112:DI]=r119:V4SI
9: r122:DI=r121:DI&0x3

   10: r123:V4SI=[r112:DI] // load from stack
   11: {r125:SI=0x3-r122:DI#0;clobber ca:SI;}
   12: r125:SI=r125:SI<<0x2
   13: {r125:SI=0x14-r125:SI;clobber ca:SI;}
   14: r128:DI=unspec[`*.LC0',%2:DI] 47
  REG_EQUAL `*.LC0'
   15: r127:V2DI=[r128:DI]
  REG_EQUAL const_vector
   16: r126:V16QI=r127:V2DI#0
   17: r129:V16QI=unspec[r120:DI#0] 61
   18: r130:V16QI=unspec[r125:SI] 151
   19: r131:V16QI=unspec[r129:V16QI,r129:V16QI,r130:V16QI] 232
   20: r132:V16QI=unspec[r126:V16QI,r126:V16QI,r130:V16QI] 232
   21: r124:V16QI=r123:V4SI#0
   22: r124:V16QI={(r132:V16QI!=const_vector)?r131:V16QI:r124:V16QI}
   23: r123:V4SI=r124:V16QI#0
   24: [r112:DI]=r123:V4SI   // store to stack.

   25: r134:V4SI=[r112:DI]
   26: r118:V4SI=r134:V4SI
   30: %2:V4SI=r118:V4SI
   31: use %2:V4SI


Thanks,
Xionghu


[PATCH v2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-06 Thread luoxhu via Gcc-patches

Hi,

On 2020/9/4 18:23, Segher Boessenkool wrote:

diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
index 03b00738a5e..00c65311f76 100644
--- a/gcc/config/rs6000/rs6000-c.c
+++ b/gcc/config/rs6000/rs6000-c.c
/* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */
@@ -1654,15 +1656,8 @@ altivec_resolve_overloaded_builtin (location_t loc, tree 
fndecl,
   SET_EXPR_LOCATION (stmt, loc);
   stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt);
 }
-
-  innerptrtype = build_pointer_type (arg1_inner_type);
-
-  stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0);
-  stmt = convert (innerptrtype, stmt);
-  stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1);
-  stmt = build_indirect_ref (loc, stmt, RO_NULL);
-  stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt,
-convert (TREE_TYPE (stmt), arg0));
+  stmt = build_array_ref (loc, stmt, arg2);
+  stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, arg0);
stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
return stmt;
  }

You should make a copy of the vector, not modify the original one in
place?  (If I read that correctly, heh.)  Looks good otherwise.



Segher, there is already existing code to make a copy of the vector, as we
discussed offline.  Thanks for the reminder.

cat pr79251.c.006t.gimple
__attribute__((noinline))
test (__vector signed int v, int i, size_t n)
{
 __vector signed int D.3192;
 __vector signed int D.3194;
 __vector signed int v1;
 v1 = v;
 D.3192 = v1;
 _1 = n & 3;
 VIEW_CONVERT_EXPR<int[4]>(D.3192)[_1] = i;
 v1 = D.3192;
 D.3194 = v1;
 return D.3194;
}

Attached is the v2 patch, which does:
1) Build a VIEW_CONVERT_EXPR for vec_insert (i, v, n) like v[n%4] = i to
unify the gimple code, so the expander can use vec_set_optab to expand it.
2) Recognize the pattern in the expander and use optabs to expand the
VIEW_CONVERT_EXPR to vec_insert with a variable index into fast instructions:
lvsl+xxperm+xxsel.


Thanks,
Xionghu
From 9a7a47b086579e26ae13f378732cea067b64a4e6 Mon Sep 17 00:00:00 2001
From: Xiong Hu Luo 
Date: Wed, 19 Aug 2020 03:54:17 -0500
Subject: [PATCH v2] rs6000: Expand vec_insert in expander instead of gimple
 [PR79251]

vec_insert accepts 3 arguments: arg0 is the input vector, arg1 is the value
to be inserted, arg2 is the place to insert arg1 into arg0.  The current expander
generates stxv+stwx+lxv if arg2 is a variable instead of a constant, which
causes a serious store-hit-load performance issue on Power.  This patch tries to
 1) Build a VIEW_CONVERT_EXPR for vec_insert (i, v, n) like v[n%4] = i to
unify the gimple code, so the expander can use vec_set_optab to expand it.
 2) Recognize the pattern in the expander and use optabs to expand the
VIEW_CONVERT_EXPR to vec_insert with a variable index into fast instructions:
lvsl+xxperm+xxsel.
In this way, "vec_insert (i, v, n)" and "v[n%4] = i" won't be expanded too
early in the gimple stage if arg2 is a variable, avoiding the store-hit-load
instructions.

For Power9 V4SI:
addi 9,1,-16
rldic 6,6,2,60
stxv 34,-16(1)
stwx 5,9,6
lxv 34,-16(1)
=>
addis 9,2,.LC0@toc@ha
addi 9,9,.LC0@toc@l
mtvsrwz 33,5
lxv 32,0(9)
sradi 9,6,2
addze 9,9
sldi 9,9,2
subf 9,9,6
subfic 9,9,3
sldi 9,9,2
subfic 9,9,20
lvsl 13,0,9
xxperm 33,33,45
xxperm 32,32,45
xxsel 34,34,33,32

Though the instruction count increases from 5 to 15, performance is improved
by 60% in typical cases.
Tested with V2DI, V2DF, V4SI, V4SF, V8HI, V16QI on Power9-LE and
Power8-BE; bootstrap and regression tests pass.

gcc/ChangeLog:

2020-09-07  Xionghu Luo  

* config/rs6000/altivec.md (altivec_lvsl_reg_2): Rename to
 (altivec_lvsl_reg_2) and extend to SDI mode.
* config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
Adjust variable index vec_insert to VIEW_CONVERT_EXPR.
* config/rs6000/rs6000-protos.h (rs6000_expand_vector_set_var):
New declare.
* config/rs6000/rs6000.c (rs6000_expand_vector_set_var):
New function.
* config/rs6000/rs6000.md (FQHS): New mode iterator.
(FD): New mode iterator.
p8_mtvsrwz_v16qi2: New define_insn.
p8_mtvsrd_v16qi2: New define_insn.
* config/rs6000/vector.md: Add register operand2 match for
vec_set index.
* config/rs6000/vsx.md: Call gen_altivec_lvsl_reg_di2.
* expr.c (expand_view_convert_to_vec_set): New function.
(expand_assignment): Call expand_view_convert_to_vec_set.

gcc/testsuite/ChangeLog:

2020-09-07  Xionghu Luo  

* gcc.target/powerpc/pr79251.c: New test.
* gcc.target/powerpc/pr79251-run.c: New test.
* gcc.target/powerpc/pr79251.h: New header.
---
 gcc/config/rs6000/altivec.md  |   4 +-
 gcc/config/rs6000/rs6000-c.c  |  22 ++-
 gcc/config/rs6000/rs6000-protos.h |   1 +
 

Re: [PATCH] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-04 Thread luoxhu via Gcc-patches



On 2020/9/4 15:23, Richard Biener wrote:
> On Fri, Sep 4, 2020 at 9:19 AM Richard Biener
>  wrote:
>>
>> On Fri, Sep 4, 2020 at 8:38 AM luoxhu  wrote:
>>>
>>>
>>>
>>> On 2020/9/4 14:16, luoxhu via Gcc-patches wrote:
>>>> Hi,
>>>>
>>>>
>>>> Yes, I checked and found that both vec_set and vec_extract doesn't support
>>>> variable index for most targets, store_bit_field_1 and extract_bit_field_1
>>>> would only consider use optabs when index is integer value.  Anyway, it
>>>> shouldn't be hard to extend depending on target requirements.
>>>>
>>>> Another problem is v[n&3]=i and vec_insert(v, i, n) are generating with
>>>> different gimple code:
>>>>
>>>> {
>>>> _1 = n & 3;
>>>> VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i;
>>>> }
>>>>
>>>> vs:
>>>>
>>>> {
>>>> __vector signed int v1;
>>>> __vector signed int D.3192;
>>>> long unsigned int _1;
>>>> long unsigned int _2;
>>>> int * _3;
>>>>
>>>>  [local count: 1073741824]:
>>>> D.3192 = v_4(D);
>>>> _1 = n_7(D) & 3;
>>>> _2 = _1 * 4;
>>>> _3 = &D.3192 + _2;
>>>> *_3 = i_8(D);
>>>> v1_10 = D.3192;
>>>> return v1_10;
>>>> }
>>>
>>> Just realized that using convert_vector_to_array_for_subscript would generate
>>> "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i;" before producing those instructions;
>>> your confirmation and comments would be highly appreciated...  Thanks in
>>> advance. :)
>>
>> I think what the GCC vector extensions produce is generally better
>> so wherever "code generation" for vec_insert resides it should be
>> adjusted to produce the same code.  Same for vec_extract.
> 
> Guess altivec.h, dispatching to __builtin_vec_insert.  Wonder why it wasn't
> 
> #define vec_insert(a,b,c) (a)[c]=(b)
> 
> anyway, you obviously have some lowering of the builtin somewhere in rs6000.c
> and thus can adjust that.
> 

Yes, altivec.h uses that style for all vector functions, not sure why.
But this could be adjusted by the patch below during front-end parsing,
which would also generate "VIEW_CONVERT_EXPR<int[4]>(D.3192)[_1] = i;"
in gimple, so both v[n&3]=i and vec_insert(v, i, n) could use optabs
in the expander:


diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
index 03b00738a5e..00c65311f76 100644
--- a/gcc/config/rs6000/rs6000-c.c
+++ b/gcc/config/rs6000/rs6000-c.c
   /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */
@@ -1654,15 +1656,8 @@ altivec_resolve_overloaded_builtin (location_t loc, tree 
fndecl,
  SET_EXPR_LOCATION (stmt, loc);
  stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt);
}
-
-  innerptrtype = build_pointer_type (arg1_inner_type);
-
-  stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0);
-  stmt = convert (innerptrtype, stmt);
-  stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1);
-  stmt = build_indirect_ref (loc, stmt, RO_NULL);
-  stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt,
-convert (TREE_TYPE (stmt), arg0));
+  stmt = build_array_ref (loc, stmt, arg2);
+  stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, arg0);
   stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
   return stmt;
 }


Re: [PATCH] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-04 Thread luoxhu via Gcc-patches




On 2020/9/4 14:16, luoxhu via Gcc-patches wrote:

Hi,


Yes, I checked and found that both vec_set and vec_extract don't support a
variable index for most targets; store_bit_field_1 and extract_bit_field_1
only consider using optabs when the index is a constant integer.  Anyway, it
shouldn't be hard to extend depending on target requirements.

Another problem is that v[n&3]=i and vec_insert(v, i, n) generate different
gimple code:

{
_1 = n & 3;
VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i;
}

vs:

{
   __vector signed int v1;
   __vector signed int D.3192;
   long unsigned int _1;
   long unsigned int _2;
   int * _3;

 <bb 2> [local count: 1073741824]:
   D.3192 = v_4(D);
   _1 = n_7(D) & 3;
   _2 = _1 * 4;
   _3 = &D.3192 + _2;
   *_3 = i_8(D);
   v1_10 = D.3192;
   return v1_10;
}


Just realized that using convert_vector_to_array_for_subscript would generate
"VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i;" before producing those instructions;
your confirmation and comments would be highly appreciated...  Thanks in
advance. :)


Xionghu



If we do not use a builtin for "vec_insert(v, i, n)", the pointer is "int*" instead
of the vector type; will it be difficult for the expander to capture so many
statements and then call the optabs?  So shall we keep the builtin style
for "vec_insert(v, i, n)" and expand "v[n&3]=i" with optabs, or expand both
with optabs?

I drafted a quick patch to expand "v[n&3]=i" with optabs as below (sorry for not
using the existing vec_set yet, as I was not quite sure); together with the first
patch, both cases can be handled as expected:


[PATCH] Expander: expand VIEW_CONVERT_EXPR to vec_insert with variable index

v[n%4] = i has the same semantics as vec_insert (i, v, n), but it will be
optimized to "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i;" in gimple.  This
patch tries to recognize the pattern in the expander and use optabs to
expand it to fast instructions like vec_insert: lvsl+xxperm+xxsel.

gcc/ChangeLog:

* config/rs6000/vector.md:
* expr.c (expand_assignment):
* optabs.def (OPTAB_CD):
---
  gcc/config/rs6000/vector.md | 13 +++
  gcc/expr.c  | 46 +
  gcc/optabs.def  |  1 +
  3 files changed, 60 insertions(+)

diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md
index 796345c80d3..46d21271e17 100644
--- a/gcc/config/rs6000/vector.md
+++ b/gcc/config/rs6000/vector.md
@@ -1244,6 +1244,19 @@ (define_expand "vec_extract"
DONE;
  })

+(define_expand "vec_insert"
+  [(match_operand:VEC_E 0 "vlogical_operand")
+   (match_operand:<VEC_base> 1 "register_operand")
+   (match_operand 2 "register_operand")]
+  "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)"
+{
+  rtx target = gen_reg_rtx (V16QImode);
+  rs6000_expand_vector_insert (target, operands[0], operands[1], operands[2]);
+  rtx sub_target = simplify_gen_subreg (GET_MODE(operands[0]), target, V16QImode, 0);
+  emit_insn (gen_rtx_SET (operands[0], sub_target));
+  DONE;
+})
+
  ;; Convert double word types to single word types
  (define_expand "vec_pack_trunc_v2df"
[(match_operand:V4SF 0 "vfloat_operand")
diff --git a/gcc/expr.c b/gcc/expr.c
index dd2200ddea8..ce2890c1a2d 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -5237,6 +5237,52 @@ expand_assignment (tree to, tree from, bool nontemporal)

to_rtx = expand_expr (tem, NULL_RTX, VOIDmode, EXPAND_WRITE);

+  tree type = TREE_TYPE (to);
+  if (TREE_CODE (to) == ARRAY_REF && tree_fits_uhwi_p (TYPE_SIZE (type))
+ && tree_fits_uhwi_p (TYPE_SIZE_UNIT (type))
+ && tree_to_uhwi (TYPE_SIZE (type))
+* tree_to_uhwi (TYPE_SIZE_UNIT (type))
+  == 128)
+   {
+ tree op0 = TREE_OPERAND (to, 0);
+ tree op1 = TREE_OPERAND (to, 1);
+ if (TREE_CODE (op0) == VIEW_CONVERT_EXPR)
+   {
+ tree view_op0 = TREE_OPERAND (op0, 0);
+ mode = TYPE_MODE (TREE_TYPE (view_op0));
+ if (TREE_CODE (TREE_TYPE (view_op0)) == VECTOR_TYPE)
+   {
+ rtx value
+   = expand_expr (from, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+ rtx pos
+   = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+ rtx temp_target = gen_reg_rtx (mode);
+ emit_move_insn (temp_target, to_rtx);
+
+ machine_mode outermode = mode;
+ scalar_mode innermode = GET_MODE_INNER (outermode);
+ class expand_operand ops[3];
+ enum insn_code icode
+   = convert_optab_handler (vec_insert_optab, innermode,
+outermode);
+
+ if (icode != CODE_FOR_nothing)
+   {
+ pos = convert_to_mode (E_SImode, pos, 0);
+
+   

Re: [PATCH] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-04 Thread luoxhu via Gcc-patches
Hi,

On 2020/9/3 18:29, Richard Biener wrote:
> On Thu, Sep 3, 2020 at 11:20 AM luoxhu  wrote:
>>
>>
>>
>> On 2020/9/2 17:30, Richard Biener wrote:
 so maybe bypass convert_vector_to_array_for_subscript for special circumstances
 like "i = v[n%4]" or "v[n&3]=i" to generate a vec_extract or vec_insert builtin
 call, as a relatively simpler method?
>>> I think you have it backward.  You need to work with what
>>> convert_vector_to_array_for_subscript
>>> gives and deal with it during RTL expansion / optimization to generate
>>> more optimal
>>> code for power.  The goal is to have as little target specific
>>> builtins during the GIMPLE
>>> optimization phase (because we cannot work out its semantics in optimizers).
>>
>> OK, got it, will add optabs vec_insert and expand 
>> "VIEW_CONVERT_EXPR(v1)[_1] = i_6(D);"
>> expressions to rs6000_expand_vector_insert instead of builtin call.
>> vec_extract already has optabs and "i = v[n%4]" should be in another patch
>> after this.
> 
> There is already vec_set and vec_extract - the question is whether the 
> expander
> tries those for variable index.
> 

Yes, I checked and found that both vec_set and vec_extract don't support a
variable index for most targets; store_bit_field_1 and extract_bit_field_1
only consider using optabs when the index is a constant integer.  Anyway, it
shouldn't be hard to extend depending on target requirements.

Another problem is that v[n&3]=i and vec_insert(v, i, n) generate different
gimple code:

{
_1 = n & 3;
VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i;
}

vs:

{
  __vector signed int v1;
  __vector signed int D.3192;
  long unsigned int _1;
  long unsigned int _2;
  int * _3;

   <bb 2> [local count: 1073741824]:
  D.3192 = v_4(D);
  _1 = n_7(D) & 3;
  _2 = _1 * 4;
  _3 = &D.3192 + _2;
  *_3 = i_8(D);
  v1_10 = D.3192;
  return v1_10;
}

If we do not use a builtin for "vec_insert(v, i, n)", the pointer is "int*" instead
of the vector type; will it be difficult for the expander to capture so many
statements and then call the optabs?  So shall we keep the builtin style
for "vec_insert(v, i, n)" and expand "v[n&3]=i" with optabs, or expand both
with optabs?

I drafted a quick patch to expand "v[n&3]=i" with optabs as below (sorry for not
using the existing vec_set yet, as I was not quite sure); together with the first
patch, both cases can be handled as expected:


[PATCH] Expander: expand VIEW_CONVERT_EXPR to vec_insert with variable index

v[n%4] = i has the same semantics as vec_insert (i, v, n), but it will be
optimized to "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i;" in gimple.  This
patch tries to recognize the pattern in the expander and use optabs to
expand it to fast instructions like vec_insert: lvsl+xxperm+xxsel.

gcc/ChangeLog:

* config/rs6000/vector.md:
* expr.c (expand_assignment):
* optabs.def (OPTAB_CD):
---
 gcc/config/rs6000/vector.md | 13 +++
 gcc/expr.c  | 46 +
 gcc/optabs.def  |  1 +
 3 files changed, 60 insertions(+)

diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md
index 796345c80d3..46d21271e17 100644
--- a/gcc/config/rs6000/vector.md
+++ b/gcc/config/rs6000/vector.md
@@ -1244,6 +1244,19 @@ (define_expand "vec_extract"
   DONE;
 })
 
+(define_expand "vec_insert"
+  [(match_operand:VEC_E 0 "vlogical_operand")
+   (match_operand:<VEC_base> 1 "register_operand")
+   (match_operand 2 "register_operand")]
+  "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)"
+{
+  rtx target = gen_reg_rtx (V16QImode);
+  rs6000_expand_vector_insert (target, operands[0], operands[1], operands[2]);
+  rtx sub_target = simplify_gen_subreg (GET_MODE(operands[0]), target, V16QImode, 0);
+  emit_insn (gen_rtx_SET (operands[0], sub_target));
+  DONE;
+})
+
 ;; Convert double word types to single word types
 (define_expand "vec_pack_trunc_v2df"
   [(match_operand:V4SF 0 "vfloat_operand")
diff --git a/gcc/expr.c b/gcc/expr.c
index dd2200ddea8..ce2890c1a2d 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -5237,6 +5237,52 @@ expand_assignment (tree to, tree from, bool nontemporal)
 
   to_rtx = expand_expr (tem, NULL_RTX, VOIDmode, EXPAND_WRITE);
 
+  tree type = TREE_TYPE (to);
+  if (TREE_CODE (to) == ARRAY_REF && tree_fits_uhwi_p (TYPE_SIZE (type))
+ && tree_fits_uhwi_p (TYPE_SIZE_UNIT (type))
+ && tree_to_uhwi (TYPE_SIZE (type))
+* tree_to_uhwi (TYPE_SIZE_UNIT (type))
+  == 128)
+   {
+ tree op0 = TREE_OPERAND (to, 0);
+ tree op1 = TREE_OPERAND (to, 1);
+ if (TREE_CODE (op0) == VIEW_CONVERT_EXPR)
+   {
+ tree view_op0 = TREE_OPERAND (op0, 0);
+ mode = TYPE_MODE (TREE_TYPE (view_op0));
+ if (TREE_CODE (TREE_TYPE (view_op0)) == VECTOR_TYPE)
+   {
+ rtx value
+   = expand_expr (from, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+ rtx pos
+   = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+ 

Re: [PATCH] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-03 Thread luoxhu via Gcc-patches



On 2020/9/2 17:30, Richard Biener wrote:
>> so maybe bypass convert_vector_to_array_for_subscript for special circumstances
>> like "i = v[n%4]" or "v[n&3]=i" to generate a vec_extract or vec_insert builtin
>> call, as a relatively simpler method?
> I think you have it backward.  You need to work with what
> convert_vector_to_array_for_subscript
> gives and deal with it during RTL expansion / optimization to generate
> more optimal
> code for power.  The goal is to have as little target specific
> builtins during the GIMPLE
> optimization phase (because we cannot work out its semantics in optimizers).

OK, got it, I will add the vec_insert optab and expand
"VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i_6(D);"
expressions to rs6000_expand_vector_insert instead of a builtin call.
vec_extract already has optabs, and "i = v[n%4]" should be in another patch
after this.


Thanks,
Xionghu



Re: [PATCH] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-02 Thread luoxhu via Gcc-patches
Hi,

On 2020/9/1 21:07, Richard Biener wrote:
> On Tue, Sep 1, 2020 at 10:11 AM luoxhu via Gcc-patches
>  wrote:
>>
>> Hi,
>>
>> On 2020/9/1 01:04, Segher Boessenkool wrote:
>>> Hi!
>>>
>>> On Mon, Aug 31, 2020 at 04:06:47AM -0500, Xiong Hu Luo wrote:
>>>> vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value
>>>> to be insert, arg2 is the place to insert arg1 to arg0.  This patch adds
>>>> __builtin_vec_insert_v4si[v4sf,v2di,v2df,v8hi,v16qi] for vec_insert to
>>>> not expand too early in gimple stage if arg2 is variable, to avoid generate
>>>> store hit load instructions.
>>>>
>>>> For Power9 V4SI:
>>>>   addi 9,1,-16
>>>>   rldic 6,6,2,60
>>>>   stxv 34,-16(1)
>>>>   stwx 5,9,6
>>>>   lxv 34,-16(1)
>>>> =>
>>>>   addis 9,2,.LC0@toc@ha
>>>>   addi 9,9,.LC0@toc@l
>>>>   mtvsrwz 33,5
>>>>   lxv 32,0(9)
>>>>   sradi 9,6,2
>>>>   addze 9,9
>>>>   sldi 9,9,2
>>>>   subf 9,9,6
>>>>   subfic 9,9,3
>>>>   sldi 9,9,2
>>>>   subfic 9,9,20
>>>>   lvsl 13,0,9
>>>>   xxperm 33,33,45
>>>>   xxperm 32,32,45
>>>>   xxsel 34,34,33,32
>>>
>>> For v a V4SI, x a SI, j some int,  what do we generate for
>>> v[j&3] = x;
>>> ?
>>> This should be exactly the same as we generate for
>>> vec_insert(x, v, j);
>>> (the builtin does a modulo 4 automatically).
>>
>> No, even with my patch "stxv 34,-16(1);stwx 5,9,6;lxv 34,-16(1)" generated 
>> currently.
>> Is it feasible and acceptable to expand some kind of pattern in expander 
>> directly without
>> builtin transition?
>>
>> I borrowed some of implementation from vec_extract.  For vec_extract, the 
>> issue also exists:
>>
>>    source:                      gimple:                               expand:                                        asm:
>> 1) i = vec_extract (v, n);  =>  i = __builtin_vec_ext_v4si (v, n);  =>  {r120:SI=unspec[r118:V4SI,r119:DI] 134;...}  =>  slwi 9,6,2   vextuwrx 3,9,2
>> 2) i = vec_extract (v, 3);  =>  i = __builtin_vec_ext_v4si (v, 3);  =>  {r120:SI=vec_select(r118:V4SI,parallel)...}  =>  li 9,12  vextuwrx 3,9,2
>> 3) i = v[n%4];              =>  _1 = n & 3;  i = VIEW_CONVERT_EXPR<int[4]>(v)[_1];  =>  ...  =>  stxv 34,-16(1); addi 9,1,-16; rldic 5,5,2,60; lwax 3,9,5
>> 4) i = v[3];                =>  i = BIT_FIELD_REF <v, 32, 96>;      =>  {r120:SI=vec_select(r118:V4SI,parallel)...}  =>  li 9,12;   vextuwrx 3,9,2
> 
> Why are 1) and 2) handled differently than 3)/4)?

1) and 2) call the builtin function vec_extract; it is defined as
 __builtin_vec_extract and will be resolved to ALTIVEC_BUILTIN_VEC_EXTRACT
 by resolve_overloaded_builtin, generating a call to __builtin_vec_ext_v4si
 that is expanded only at RTL.
3) accesses the variable v as an array type with the opcode VIEW_CONVERT_EXPR; I
 guess we should also generate a builtin call instead of calling
 convert_vector_to_array_for_subscript to generate the VIEW_CONVERT_EXPR
 expression for this kind of usage.
4) is translated to BIT_FIELD_REF with constant bitstart and bitsize, so the
variable v can also be accessed via a register instead of the stack, and optabs
could match the rs6000_expand_vector_insert to generate the expected instruction
through extract_bit_field.

> 
>> Case 3) also couldn't handle the similar usage, and case 4) doesn't generate 
>> builtin as expected,
>> it just expand to vec_select by coincidence.  So does this mean both 
>> vec_insert and vec_extract
>> and all other similar vector builtins should use IFN as suggested by Richard 
>> Biener, to match the
>> pattern in gimple and expand both constant and variable index in expander?  
>> Will this also be
>> beneficial for other targets except power?  Or we should do that gradually 
>> after this patch
>> approved as it seems another independent issue?  Thanks:)
> 
> If the code generated for 3)/4) isn't optimal you have to figure why
> by tracing the RTL
> expansion code and looking for missing optabs.
> 
> Consider the amount of backend code you need to write if ever using
> those in constexpr
> context ...

It seems too complicated to expand the "i = VIEW_CONVERT_EXPR<int[4]>(v)[_1];"
or "VIEW_CONVERT_EXPR<int[4]>(v1)[_1] = i_6(D);" to
rs6000_expand_vector_inser

Re: [PATCH] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-01 Thread luoxhu via Gcc-patches
Hi,

On 2020/9/1 00:47, will schmidt wrote:
>> +  tmode = TYPE_MODE (TREE_TYPE (arg0));
>> +  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
>> +  mode2 = TYPE_MODE ((TREE_TYPE (arg2)));
>> +  gcc_assert (VECTOR_MODE_P (tmode));
>> +
>> +  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
>> +  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
>> +  op2 = expand_expr (arg2, NULL_RTX, mode2, EXPAND_NORMAL);
>> +
>> +  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
>> +op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
>> +
>> +  op0 = force_reg (tmode, op0);
>> +  op1 = force_reg (mode1, op1);
>> +  op2 = force_reg (mode2, op2);
>> +
>> +  target = gen_reg_rtx (V16QImode);
> Should that be tmode, or is V16QImode always correct here?


Thanks for the review.
Yes, the target should be TImode here, but the following call to
rs6000_expand_vector_insert needs a lot of emit_insns in it;
using V16QI lets it reuse most of the patterns in the existing md files.
After returning from this function there will be a conversion from
V16QImode to TImode to make the types match:

expr.c: convert_move (target, temp, TYPE_UNSIGNED (TREE_TYPE (exp)));

and I've tested this with V2DI, V2DF, V4SI, V4SF, V8HI, V16QI on
Power9-LE and Power8-BE; the correctness of the results is ensured.
The other comments are addressed.  I will update the patch later if there are no
disagreements about the implementation.


Thanks,
Xionghu

> 
>> +  rs6000_expand_vector_insert (target, op0, op1, op2);
>> +
>> +  return target;
>> +}


Re: [PATCH] rs6000: Expand vec_insert in expander instead of gimple [PR79251]

2020-09-01 Thread luoxhu via Gcc-patches
Hi,

On 2020/9/1 01:04, Segher Boessenkool wrote:
> Hi!
> 
> On Mon, Aug 31, 2020 at 04:06:47AM -0500, Xiong Hu Luo wrote:
>> vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value
>> to be insert, arg2 is the place to insert arg1 to arg0.  This patch adds
>> __builtin_vec_insert_v4si[v4sf,v2di,v2df,v8hi,v16qi] for vec_insert to
>> not expand too early in gimple stage if arg2 is variable, to avoid generate
>> store hit load instructions.
>>
>> For Power9 V4SI:
>>  addi 9,1,-16
>>  rldic 6,6,2,60
>>  stxv 34,-16(1)
>>  stwx 5,9,6
>>  lxv 34,-16(1)
>> =>
>>  addis 9,2,.LC0@toc@ha
>>  addi 9,9,.LC0@toc@l
>>  mtvsrwz 33,5
>>  lxv 32,0(9)
>>  sradi 9,6,2
>>  addze 9,9
>>  sldi 9,9,2
>>  subf 9,9,6
>>  subfic 9,9,3
>>  sldi 9,9,2
>>  subfic 9,9,20
>>  lvsl 13,0,9
>>  xxperm 33,33,45
>>  xxperm 32,32,45
>>  xxsel 34,34,33,32
> 
> For v a V4SI, x a SI, j some int,  what do we generate for
>v[j&3] = x;
> ?
> This should be exactly the same as we generate for
>vec_insert(x, v, j);
> (the builtin does a modulo 4 automatically).

No; even with my patch, "stxv 34,-16(1); stwx 5,9,6; lxv 34,-16(1)" is currently
generated.
Is it feasible and acceptable to expand some kind of pattern directly in the
expander, without the builtin transition?

I borrowed some of the implementation from vec_extract.  For vec_extract, the issue
also exists:

   source:                      gimple:                               expand:                                        asm:
1) i = vec_extract (v, n);  =>  i = __builtin_vec_ext_v4si (v, n);  =>  {r120:SI=unspec[r118:V4SI,r119:DI] 134;...}  =>  slwi 9,6,2   vextuwrx 3,9,2
2) i = vec_extract (v, 3);  =>  i = __builtin_vec_ext_v4si (v, 3);  =>  {r120:SI=vec_select(r118:V4SI,parallel)...}  =>  li 9,12  vextuwrx 3,9,2
3) i = v[n%4];              =>  _1 = n & 3;  i = VIEW_CONVERT_EXPR<int[4]>(v)[_1];  =>  ...  =>  stxv 34,-16(1); addi 9,1,-16; rldic 5,5,2,60; lwax 3,9,5
4) i = v[3];                =>  i = BIT_FIELD_REF <v, 32, 96>;      =>  {r120:SI=vec_select(r118:V4SI,parallel)...}  =>  li 9,12;   vextuwrx 3,9,2

Case 3) also can't handle the similar usage, and case 4) doesn't generate the
builtin as expected; it just expands to vec_select by coincidence.  So does this
mean both vec_insert and vec_extract, and all other similar vector builtins,
should use an IFN as suggested by Richard Biener, to match the pattern in gimple
and expand both constant and variable indexes in the expander?
Will this also be beneficial for targets other than Power?  Or should we do that
gradually after this patch is approved, since it seems to be an independent issue?
Thanks :)


Xionghu


Re: [PATCH] ipa-inline: Improve growth accumulation for recursive calls

2020-08-14 Thread luoxhu via Gcc-patches
Hi,

On 2020/8/13 20:52, Jan Hubicka wrote:
>> Since there are no other callers outside of these specialized nodes, the
>> guessed profile count should be same equal?  Perf tool shows that even
>> each specialized node is called only once, none of them take same time for
>> each call:
>>
>>40.65%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_digits_2.constprop.4
>>16.31%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_digits_2.constprop.3
>>10.91%  exchange2_gcc.o  libgfortran.so.5.0.0 [.] 
>> _gfortran_mminloc0_4_i4
>> 5.41%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_digits_2.constprop.6
>> 4.68%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __logic_MOD_new_solver
>> 3.76%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_digits_2.constprop.5
>> 1.07%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_digits_2.constprop.7
>> 0.84%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_brute.constprop.0
>> 0.47%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_digits_2.constprop.2
>> 0.24%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_digits_2.constprop.1
>> 0.24%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_covered.constprop.0
>> 0.11%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_reflected.constprop.0
>> 0.00%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
>> __brute_force_MOD_brute.constprop.1
>>
>>
>> digits_2.constprop.4 & digits_2.constprop.3 takes most of the execution time,
>> So profile count and frequency seem not very helpful for this case?
> Yep, you can not really determine the time spent on each of recursion
> levels from the recursive edge probability since you can not assume it
> to be the same on each level of recursion (here we know it is 0 at level
> 10 and it is slowly dropping as the recursion increases because there
> are fewer possibilities to fill up the partly filled sudoku:).
> Especially you can not assume it after the ipa-cp did the work to figure
> out that there is recursion depth counter that affect the function.
> 
> Thinking of the ipa-cp profile updating changes, I did not consider safe
> the approach of adding extra weight to the recursive call. The problem
> is that the recursive call frequency estimate, when coming from static
> profile estimation, is likely completely off, just like static profile
> estimator can not be used to predict realistically the number of
> iterations of loops.  However even with profile feedback this is hard to
> interpret.
> 
> I was wondering if we can not simply detect this scenario and avoid
> using the recursive edge probability in the profile updating.
> We could simply scale by the number of copies.
> This means that if we produce 10 clones, we could just set each clone to
> 1/10th of the frequency.  This implies that the hottest spot will not be
> underestimated exponentially much as can happen now, but just 10 times
> at worst, which is probably still OK. We play similar games in loop
> optimizers: static profile estimators expect every loop to trip around 3
> times so unroller/vectorizer/etc would make no sense on them.
> 
> By scaling down according to number of copies we will keep the
> frequencies of calls to function called from clones "sane".  We will
> still run into problems when we inline one clone to another (sine its
> proflie will get scaled by the recursive edge frequency), but perhaps
> that can be special cases in profiler or make ipa-cp to adjust the
> recursive edges to get frequency close to 1 as a hack.
> 
> This is not pretty, but at least it looks safer to me than the original
> profile updating patch adding extra weight to the frequency.
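[Editorial note] A minimal sketch of the scaling idea described above (illustrative only; it assumes the clone's cgraph_node is at hand and num_copies is the number of specialized clones produced):

  /* Give each of the num_copies specialized clones 1/num_copies of the
     original count instead of deriving it from the (unreliable) recursive
     edge probability.  */
  node->count = node->count.apply_scale (1, num_copies);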


Really appreciate your detailed explanation.  BTW, my previous patch
for the PGO build of exchange2 took a similar approach, setting each cloned
node to 1/10th of the frequency, several months ago :)

https://gcc.gnu.org/pipermail/gcc-patches/2020-June/546926.html

Xionghu


Re: [PATCH] ipa-inline: Improve growth accumulation for recursive calls

2020-08-13 Thread luoxhu via Gcc-patches
Hi,

On 2020/8/13 01:53, Jan Hubicka wrote:
> Hello,
> with Martin we spent some time looking into exchange2 and my
> understanding of the problem is the following:
> 
> There is the self recursive function digits_2 with the property that it
> has 10 nested loops and calls itself from the innermost.
> Now we do not do amazing job on guessing the profile since it is quite
> atypical. First observation is that the callback frequencly needs to be
> less than 1 otherwise the program never terminates, however with 10
> nested loops one needs to predict every loop to iterate just few times
> and conditionals guarding them as not very likely. For that we added
> PRED_LOOP_GUARD_WITH_RECURSION some time ago and I fixed it yesterday
> (causing regression in exhange since the bad profile turned out to
> disable some harmful vectorization) and I also now added a cap to the
> self recursive frequency so things to not get mispropagated by ipa-cp.

Thanks for the information :)  Tamar replied that there is another
regression, *on exchange2 it is 11%*.  I've also rebased my code and confirmed
it really is getting even slower than before (reverting the patch pulls the
performance back)...

> 
> Now if ipa-cp decides to duplicate digits few times we have a new
> problem.  The tree of recursion is organized in a way that the depth is
> bounded by 10 (which GCC does not know) and moreover most time is not
> spent on very deep levels of recursion.
> 
> For that you have the patch which increases frequencies of recursively
> cloned nodes, however it still seems to me as very specific hack for
> exchange: I do not see how to guess where most of time is spent.
> Even for very regular trees, by master theorem, it depends on very
> little differences in the estimates of recursion frequency whether most
> of time is spent on the top of tree, bottom or things are balanced.

The build is not PGO, so I am not clear how the profile count will affect the
ipa-cp and ipa-inline decisions.
Since there are no other callers outside of these specialized nodes, the
guessed profile counts should all be equal?  The perf tool shows that even though
each specialized node is called only once, none of them takes the same time per
call:

  40.65%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_digits_2.constprop.4
  16.31%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_digits_2.constprop.3
  10.91%  exchange2_gcc.o  libgfortran.so.5.0.0 [.] _gfortran_mminloc0_4_i4
   5.41%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_digits_2.constprop.6
   4.68%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] __logic_MOD_new_solver
   3.76%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_digits_2.constprop.5
   1.07%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_digits_2.constprop.7
   0.84%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_brute.constprop.0
   0.47%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_digits_2.constprop.2
   0.24%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_digits_2.constprop.1
   0.24%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_covered.constprop.0
   0.11%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_reflected.constprop.0
   0.00%  exchange2_gcc.o  exchange2_gcc.orig.slow  [.] 
__brute_force_MOD_brute.constprop.1


digits_2.constprop.4 & digits_2.constprop.3 take most of the execution time,
so profile count and frequency seem not very helpful for this case?

> 
> With algorithms doing backtracing, like exhchange, the likelyness of
> recusion reduces with deeper recursion level, but we do not know how
> quickly and what the level is.


> 
>> From: Xiong Hu Luo 
>>
>> For SPEC2017 exchange2, there is a large recursive function digits_2 (function
>> size 1300) which generates specialized nodes digits_2.1 to digits_2.8 with the
>> added build option:
>>
>> --param ipa-cp-eval-threshold=1 --param ipa-cp-unit-growth=80
>>
>> ipa-inline pass will consider inline these nodes called only once, but these
>> large functions inlined too deeply will cause serious register spill and
>> performance down as followed.
>>
>> inlineA: brute (inline digits_2.1, 2.2, 2.3, 2.4) -> digits_2.5 (inline 2.6, 
>> 2.7, 2.8)
>> inlineB: digits_2.1 (inline digits_2.2, 2.3) -> call digits_2.4 (inline 
>> digits_2.5, 2.6) -> call digits_2.7 (inline 2.8)
>> inlineC: brute (inline digits_2) -> call 2.1 -> 2.2 (inline 2.3) -> 2.4 -> 
>> 2.5 -> 2.6 (inline 2.7 ) -> 2.8
>> inlineD: brute -> call digits_2 -> call 2.1 -> call 2.2 -> 2.3 -> 2.4 -> 2.5 
>> -> 2.6 -> 2.7 -> 2.8
>>
>> Performance diff:
>> inlineB is ~25% faster than inlineA;
>> inlineC is ~20% faster than inlineB;
>> inlineD is ~30% faster than inlineC.
>>
>> The master GCC code now generates inline sequence like inlineB, this patch
>> makes the ipa-inline pass behavior like inlineD by:
>>   1) The growth acumulation for recursive 

Re: [PATCH v5] dse: Remove partial load after full store for high part access[PR71309]

2020-08-05 Thread luoxhu via Gcc-patches

Hi Richard,

On 2020/8/3 22:01, Richard Sandiford wrote:

/* Try a wider mode if truncating the store mode to NEW_MODE
 requires a real instruction.  */
if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
@@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
  && !targetm.modes_tieable_p (new_mode, store_mode))
continue;
  
+  if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)


Like I mentioned before, this should be the other way around:

 multiple_p (shift, GET_MODE_BITSIZE (new_mode))

i.e. for the subreg to be valid, the shift amount must be a multiple
of the outer mode size in bits.

OK with that change, thanks, and sorry for the multiple review cycles.


Just one more question: do we want to backport this patch to the gcc-10
and gcc-9 branches? :)


Xionghu




Richard



Re: [PATCH v5] dse: Remove partial load after full store for high part access[PR71309]

2020-08-03 Thread luoxhu via Gcc-patches



On 2020/8/3 22:01, Richard Sandiford wrote:


/* Try a wider mode if truncating the store mode to NEW_MODE
 requires a real instruction.  */
if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
@@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
  && !targetm.modes_tieable_p (new_mode, store_mode))
continue;
  
+  if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)


Like I mentioned before, this should be the other way around:

 multiple_p (shift, GET_MODE_BITSIZE (new_mode))

i.e. for the subreg to be valid, the shift amount must be a multiple
of the outer mode size in bits.

OK with that change, thanks, and sorry for the multiple review cycles.


I appreciate your time and patience in making it better :).
Updated with that change and the lp64 requirement from Segher, and committed as
r11-2526-g265d817b1eb4644c7a9613ad6920315d98e2e0a4, thank you all.

Xionghu



Richard



[PATCH v5] dse: Remove partial load after full store for high part access[PR71309]

2020-08-03 Thread luoxhu via Gcc-patches
Thanks, the v5 update follows the comments:
 1. Move the const_rhs shift out of the loop;
 2. Iterate from the smallest integer mode that fits read_mode.


This patch could optimize (it works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

The final ASM will be as below, without a partial load after the full store (stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]
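
A minimal C sketch of the kind of source this targets (my illustration,
modeled on the pr71309.c testcase added by the patch; not the exact file):

  /* Illustrative only: a full 16-byte store of one struct path is
     immediately followed by a load of its high 8 bytes, which can now
     be taken as a subreg of the stored register instead of a reload.  */
  struct path { void *mnt, *dentry; };
  struct nameidata { struct path path, root; };

  void *
  foo (struct nameidata *nd)
  {
    nd->path = nd->root;     /* TImode load + full TImode store       */
    return nd->path.dentry;  /* high-part read: subreg, not a reload  */
  }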

gcc/ChangeLog:

2020-08-03  Xionghu Luo  

PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-08-03  Xionghu Luo  

PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c  | 78 ++
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 +
 2 files changed, 82 insertions(+), 29 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..b67b5c2ba35 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1720,6 +1720,35 @@ find_shift_sequence (poly_int64 access_size,
   scalar_int_mode new_mode;
   rtx read_reg = NULL;
 
+  /* If a constant was stored into memory, try to simplify it here,
+ otherwise the cost of the shift might preclude this optimization
+ e.g. at -Os, even when no actual shift will be needed.  */
+  if (store_info->const_rhs)
+{
+  auto new_mode = smallest_int_mode_for_size (access_size * BITS_PER_UNIT);
+  auto byte = subreg_lowpart_offset (new_mode, store_mode);
+  rtx ret
+   = simplify_subreg (new_mode, store_info->const_rhs, store_mode, byte);
+  if (ret && CONSTANT_P (ret))
+   {
+ rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
+ ret = simplify_const_binary_operation (LSHIFTRT, new_mode, ret,
+shift_rtx);
+ if (ret && CONSTANT_P (ret))
+   {
+ byte = subreg_lowpart_offset (read_mode, new_mode);
+ ret = simplify_subreg (read_mode, ret, new_mode, byte);
+ if (ret && CONSTANT_P (ret)
+ && (set_src_cost (ret, read_mode, speed)
+ <= COSTS_N_INSNS (1)))
+   return ret;
+   }
+   }
+}
+
+  if (require_cst)
+return NULL_RTX;
+
   /* Some machines like the x86 have shift insns for each size of
  operand.  Other machines like the ppc or the ia-64 may only have
  shift insns that shift values within 32 or 64 bit registers.
@@ -1729,7 +1758,7 @@ find_shift_sequence (poly_int64 access_size,
 
   opt_scalar_int_mode new_mode_iter;
   FOR_EACH_MODE_FROM (new_mode_iter,
- smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+ smallest_int_mode_for_size (GET_MODE_BITSIZE (read_mode)))
 {
   rtx target, new_reg, new_lhs;
   rtx_insn *shift_seq, *insn;
@@ -1739,34 +1768,6 @@ find_shift_sequence (poly_int64 access_size,
   if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
break;
 
-  /* If a constant was stored into memory, try to simplify it here,
-otherwise the cost of the shift might preclude this optimization
-e.g. at -Os, even when no actual shift will be needed.  */
-  if (store_info->const_rhs)
-   {
- poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
- rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
-store_mode, byte);
- if (ret && CONSTANT_P (ret))
-   {
- rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
- ret = simplify_const_binary_operation (LSHIFTRT, new_mode,
-ret, shift_rtx);
- if (ret && CONSTANT_P (ret))
-   {
- byte = subreg_lowpart_offset (read_mode, new_mode);
- ret = simplify_subreg (read_mode, ret, new_mode, byte);
- if (ret && CONSTANT_P (ret)
- && (set_src_cost (ret, read_mode, speed)
- <= COSTS_N_INSNS (1)))
-   return ret;
-   }
-   }
-   }
-
-  if (require_cst)
-   return NULL_RTX;
-
   /* Try a wider mode if truncating the store mode to NEW_MODE
 requires a real instruction.  */
   if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
@@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
  && !targetm.modes_tieable_p (new_mode, 

Re: [PATCH v4] dse: Remove partial load after full store for high part access[PR71309]

2020-07-28 Thread luoxhu via Gcc-patches

Gentle ping in case this mail is missed, Thanks :)

https://gcc.gnu.org/pipermail/gcc-patches/2020-July/550602.html

Xionghu


On 2020/7/24 18:47, luoxhu via Gcc-patches wrote:

Hi Richard,

This is the updated version that passes all regression tests on
Power9-LE.
It just needs another "maybe_lt (GET_MODE_SIZE (new_mode), access_size)"
before generating the shift for store_info->const_rhs, to ensure the correct
constant is generated; take testsuite/gfortran1/equiv_2.x for example:

equiv_2.x-equiv_2.f90.264r.cse2:
5: r119:DI=0x6867666564636261
6: [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=[sfp:DI+0x21]
20: r125:CC=cmp(r124:SI,r123:SI)

Bad code: taking the SImode subreg 0 and shifting right by 8 bits gives
0x646362 in insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
5:  r119:DI=0x6867666564636261
32: r126:SI=0x646362
6:  [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=r126:SI
20: r125:CC=cmp(r124:SI,r123:SI)

Good code: getting 0x65646362 in insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
5:  r119:DI=0x6867666564636261
32: r126:SI=0x65646362
6:  [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=r126:SI
20: r125:CC=cmp(r124:SI,r123:SI)

So the new_mode size should also be greater than or equal to access_size for
the const_rhs shift.  It seems a bit more complicated now...



[PATCH v4] dse: Remove partial load after full store for high part 
access[PR71309]


This patch could optimize(works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below without partial load after full store(stxv+ld):
   ld 10,16(3)
   mr 9,3
   ld 3,24(3)
   std 10,0(9)
   std 3,8(9)
   blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]

gcc/ChangeLog:

2020-07-24  Xionghu Luo  

PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-24  Xionghu Luo  

PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
---
  gcc/dse.c  | 25 ++--
  gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++
  2 files changed, 56 insertions(+), 2 deletions(-)
  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..a7cdc43a182 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1728,8 +1728,7 @@ find_shift_sequence (poly_int64 access_size,
   the machine.  */

opt_scalar_int_mode new_mode_iter;
-  FOR_EACH_MODE_FROM (new_mode_iter,
- smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+  FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
  {
rtx target, new_reg, new_lhs;
rtx_insn *shift_seq, *insn;
@@ -1744,6 +1743,9 @@ find_shift_sequence (poly_int64 access_size,
 e.g. at -Os, even when no actual shift will be needed.  */
if (store_info->const_rhs)
{
+ if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+   continue;
+
  poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
  rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
 store_mode, byte);
@@ -1779,6 +1781,25 @@ find_shift_sequence (poly_int64 access_size,
  && !targetm.modes_tieable_p (new_mode, store_mode))
continue;

+  if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
+ && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+   {
+ /* Try to implement the shift using a subreg.  */
+ poly_int64 offset = subreg_offset_from_lsb (new_mode,
+ store_mode, shift);
+ rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
+   store_mode, offset);
+ if (rhs_subreg)
+   {
+ read_reg
+   = extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+ break;
+   }
+   }
+
+  if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+   continue;
+
new_reg = gen_reg_rtx (new_mode);

start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c 
b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define 

[PATCH v4] dse: Remove partial load after full store for high part access[PR71309]

2020-07-24 Thread luoxhu via Gcc-patches
Hi Richard,

This is the updated version that passes all regression tests on
Power9-LE.
It just needs another "maybe_lt (GET_MODE_SIZE (new_mode), access_size)"
before generating the shift for store_info->const_rhs, to ensure the correct
constant is generated; take testsuite/gfortran1/equiv_2.x for example:

equiv_2.x-equiv_2.f90.264r.cse2:
5: r119:DI=0x6867666564636261
6: [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=[sfp:DI+0x21]
20: r125:CC=cmp(r124:SI,r123:SI)

Bad code: taking the SImode subreg 0 and shifting right by 8 bits gives
0x646362 in insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
5:  r119:DI=0x6867666564636261
32: r126:SI=0x646362
6:  [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=r126:SI
20: r125:CC=cmp(r124:SI,r123:SI)

Good code: getting 0x65646362 in insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
5:  r119:DI=0x6867666564636261
32: r126:SI=0x65646362
6:  [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=r126:SI
20: r125:CC=cmp(r124:SI,r123:SI)

So the new_mode size should also be greater than or equal to access_size for
the const_rhs shift.  It seems a bit more complicated now...
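
A small standalone C illustration of that size requirement (my own sketch
using the constant from the equiv_2 dump above, little-endian; gap = 1,
access_size = 4 + 1 = 5, shift = 8):

  #include <stdio.h>
  #include <stdint.h>

  int
  main (void)
  {
    uint64_t stored = 0x6867666564636261ULL; /* DImode store_info->const_rhs */

    /* Shift done in a mode wide enough for access_size (DImode here):
       byte 0x65 survives and the SImode low part is 0x65646362.  */
    uint32_t good = (uint32_t) (stored >> 8);

    /* Shift done after truncating to SImode first (narrower than
       access_size): byte 0x65 is already lost, giving 0x00646362.  */
    uint32_t bad = ((uint32_t) stored) >> 8;

    printf ("good = 0x%08x, bad = 0x%08x\n", good, bad);
    return 0;
  }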



[PATCH v4] dse: Remove partial load after full store for high part 
access[PR71309]


This patch could optimize(works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below without partial load after full store(stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]

gcc/ChangeLog:

2020-07-24  Xionghu Luo  

PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-24  Xionghu Luo  

PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c  | 25 ++--
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++
 2 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..a7cdc43a182 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1728,8 +1728,7 @@ find_shift_sequence (poly_int64 access_size,
  the machine.  */
 
   opt_scalar_int_mode new_mode_iter;
-  FOR_EACH_MODE_FROM (new_mode_iter,
- smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+  FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
 {
   rtx target, new_reg, new_lhs;
   rtx_insn *shift_seq, *insn;
@@ -1744,6 +1743,9 @@ find_shift_sequence (poly_int64 access_size,
 e.g. at -Os, even when no actual shift will be needed.  */
   if (store_info->const_rhs)
{
+ if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+   continue;
+
  poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
  rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
 store_mode, byte);
@@ -1779,6 +1781,25 @@ find_shift_sequence (poly_int64 access_size,
  && !targetm.modes_tieable_p (new_mode, store_mode))
continue;
 
+  if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
+ && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+   {
+ /* Try to implement the shift using a subreg.  */
+ poly_int64 offset = subreg_offset_from_lsb (new_mode,
+ store_mode, shift);
+ rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
+   store_mode, offset);
+ if (rhs_subreg)
+   {
+ read_reg
+   = extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+ break;
+   }
+   }
+
+  if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+   continue;
+
   new_reg = gen_reg_rtx (new_mode);
 
   start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c 
b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+TYPE2 mnt;
+TYPE dentry;
+};
+
+struct nameidata {
+struct path path;
+struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  TYPE2 d2;
+
+  nd->path = 

Re: [PATCH v3] dse: Remove partial load after full store for high part access[PR71309]

2020-07-23 Thread luoxhu via Gcc-patches



On 2020/7/23 04:30, Richard Sandiford wrote:
> 
> I now realise the reason is that the starting mode is too wide.
> I think we should fix that by doing:
> 
>FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
>  {
>…
> 
> and then add:
> 
>if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
>  continue;
> 
> after your optimisation, so that the shift code proper still only
> considers modes that are wide enough to hold the unshifted value.
> 
> I don't think there are any efficiency concerns with that, since
> smallest_int_mode_for_size does its own similar iteration internally.
> 
> Sorry for not picking up on that first time.
> 

Thanks :), I didn't make it clear that it started from TImode first...

The updated patch uses "FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)"
to iterate from QImode now, but it still needs the size check
"new_mode <= store_mode" to get a legal subreg, otherwise
"gfortran.dg/shift-kind.f90" will fail.
Even so, there are still the execution failures below when running the
regression tests:

gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2 -fbounds-check
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2 
-fomit-frame-pointer -finline-functions
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2 
-fomit-frame-pointer -finline-functions -funroll-loops
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O3 -g
gfortran.fortran-torture/execute/equiv_2.f90 execution, -O2 -ftree-vectorize 
-maltivec 

Any clue why the execution outputs "STOP 2", please?  Still investigating.

PS: if the order of GET_MODE_BITSIZE (new_mode) and shift in multiple_p is
swapped, it generates incorrect ASM for PR71309.


This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below without partial load after full store(stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]

gcc/ChangeLog:

2020-07-23  Xionghu Luo  

PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-23  Xionghu Luo  

PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c  | 22 +--
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++
 2 files changed, 53 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..afc6ad30623 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1728,8 +1728,7 @@ find_shift_sequence (poly_int64 access_size,
  the machine.  */
 
   opt_scalar_int_mode new_mode_iter;
-  FOR_EACH_MODE_FROM (new_mode_iter,
- smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+  FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
 {
   rtx target, new_reg, new_lhs;
   rtx_insn *shift_seq, *insn;
@@ -1779,6 +1778,25 @@ find_shift_sequence (poly_int64 access_size,
  && !targetm.modes_tieable_p (new_mode, store_mode))
continue;
 
+  if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
+ && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+   {
+ /* Try to implement the shift using a subreg.  */
+ poly_int64 offset
+   = subreg_offset_from_lsb (new_mode, store_mode, shift);
+ rtx rhs_subreg
+   = simplify_gen_subreg (new_mode, store_info->rhs, store_mode, 
offset);
+ if (rhs_subreg)
+   {
+ read_reg
+   = extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+ break;
+   }
+   }
+
+  if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+   continue;
+
   new_reg = gen_reg_rtx (new_mode);
 
   start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c 
b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+TYPE2 mnt;
+TYPE dentry;
+};
+
+struct nameidata {
+struct path path;
+struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  

Re: [PATCH v3] dse: Remove partial load after full store for high part access[PR71309]

2020-07-22 Thread luoxhu via Gcc-patches
Hi,

On 2020/7/22 19:05, Richard Sandiford wrote:
> This wasn't really what I meant.  Using subregs is fine, but I was
> thinking of:
> 
>/* Also try a wider mode if the necessary punning is either not
>desirable or not possible.  */
>if (!CONSTANT_P (store_info->rhs)
> && !targetm.modes_tieable_p (new_mode, store_mode))
>   continue;
> 
>if (multiple_p (shift, GET_MODE_BITSIZE (new_mode)))
>   {
> /* Try to implement the shift using a subreg.  */
> poly_int64 offset = subreg_offset_from_lsb (new_mode, store_mode,
> shift);
> rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
>   store_mode, offset);
> if (rhs_subreg)
>   {
> ...
> break;
>   }
>   }
> 
> where the rhs_subreg is from your original patch.
> 
> The multiple_p should be that way round: the shift needs to be a
> multiple of the new_mode for the subreg to be valid.
> 
> I think this should also avoid the BITS_PER_WORD problem.  On the
> other hand, I agree BITS_PER_UNIT isn't a very sensible limit if
> we're using subregs, so maybe moving it to after the multiple_p
> if block would still make sense.
> 

Thanks, I took that rhs_subreg part back for the v3 patch and updated it a bit
based on your prototype; shift has to be put in op1, as multiple_p requires
op0 >= op1.

Then, new_mode is still TImode, the same as store_mode, and offset returns 8
when shift is 64.  simplify_gen_subreg needs an additional inner_mode (DImode)
generated from "smallest_int_mode_for_size (shift)" to get rhs_subreg,
otherwise it returns NULL when new_mode is equal to store_mode.

Lastly, the BITS_PER_UNIT check is moved after multiple_p, as it still needs
to generate shift_seq for other circumstances. :)
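
For my own notes, a hedged sketch of the byte offset involved (a plain C
stand-in for subreg_offset_from_lsb, ignoring the extra
BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN handling the real helper does):

  #include <stdio.h>

  int
  main (void)
  {
    int store_bytes = 16;  /* GET_MODE_SIZE (TImode), the stored value */
    int read_bytes  = 8;   /* GET_MODE_SIZE (DImode), the high half    */
    int shift       = 64;  /* bits between the two LSBs                */

    int le_offset = shift / 8;                             /* 8 */
    int be_offset = store_bytes - read_bytes - shift / 8;  /* 0 */

    printf ("LE byte offset %d, BE byte offset %d\n", le_offset, be_offset);
    return 0;
  }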


[PATCH v3] dse: Remove partial load after full store for high part 
access[PR71309]


This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below without partial load after full store(stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]

gcc/ChangeLog:

2020-07-22  Xionghu Luo  

PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-22  Xionghu Luo  

PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c  | 21 --
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++
 2 files changed, 52 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..aaa161237c3 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1736,8 +1736,6 @@ find_shift_sequence (poly_int64 access_size,
   int cost;
 
   new_mode = new_mode_iter.require ();
-  if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
-   break;
 
   /* If a constant was stored into memory, try to simplify it here,
 otherwise the cost of the shift might preclude this optimization
@@ -1779,6 +1777,25 @@ find_shift_sequence (poly_int64 access_size,
  && !targetm.modes_tieable_p (new_mode, store_mode))
continue;
 
+  if (multiple_p (GET_MODE_BITSIZE (new_mode), shift))
+   {
+ /* Try to implement the shift using a subreg.  */
+ scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
+ poly_int64 offset
+   = subreg_offset_from_lsb (new_mode, store_mode, shift);
+ rtx rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
+   store_mode, offset);
+ if (rhs_subreg)
+   {
+ read_reg = extract_low_bits (read_mode, inner_mode,
+  copy_rtx (rhs_subreg));
+ break;
+   }
+   }
+
+  if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
+   break;
+
   new_reg = gen_reg_rtx (new_mode);
 
   start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c 
b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define 

Re: [PATCH v2] dse: Remove partial load after full store for high part access[PR71309]

2020-07-22 Thread luoxhu via Gcc-patches
Hi,

On 2020/7/21 23:30, Richard Sandiford wrote:
> Xiong Hu Luo  writes:>> @@ -1872,9 +1872,27 @@ 
> get_stored_val (store_info *store_info, machine_mode read_mode,
>>   {
>> poly_int64 shift = gap * BITS_PER_UNIT;
>> poly_int64 access_size = GET_MODE_SIZE (read_mode) + gap;
>> -  read_reg = find_shift_sequence (access_size, store_info, read_mode,
>> -  shift, optimize_bb_for_speed_p (bb),
>> -  require_cst);
>> +  rtx rhs_subreg = NULL;
>> +
>> +  if (known_eq (GET_MODE_BITSIZE (store_mode), shift * 2))
>> +{
>> +  scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
>> +  poly_uint64 sub_off
>> += ((!BYTES_BIG_ENDIAN)
>> + ? GET_MODE_SIZE (store_mode) - GET_MODE_SIZE (inner_mode)
>> + : 0);
>> +
>> +  rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
>> +store_mode, sub_off);
>> +  if (rhs_subreg)
>> +read_reg
>> +  = extract_low_bits (read_mode, inner_mode, copy_rtx (rhs_subreg));
>> +}
>> +
>> +  if (read_reg == NULL)
>> +read_reg
>> +  = find_shift_sequence (access_size, store_info, read_mode, shift,
>> + optimize_bb_for_speed_p (bb), require_cst);
> 
> Did you consider doing this in find_shift_sequence instead?
> ISTM that this is really using subregs to optimise:
> 
>/* In theory we could also check for an ashr.  Ian Taylor knows
>of one dsp where the cost of these two was not the same.  But
>this really is a rare case anyway.  */
>target = expand_binop (new_mode, lshr_optab, new_reg,
>gen_int_shift_amount (new_mode, shift),
>new_reg, 1, OPTAB_DIRECT);
> 
> I think everything up to:
> 
>/* Also try a wider mode if the necessary punning is either not
>desirable or not possible.  */
>if (!CONSTANT_P (store_info->rhs)
> && !targetm.modes_tieable_p (new_mode, store_mode))
>   continue;
> 
> is either neutral or helpful for the subreg case too, so maybe
> we could just add the optimisation after that.  (It probably isn't
> worth reusing any of the later loop body code though, since the
> subreg case is much simpler.)
> 
> I don't think we need to restrict this case to modes of size
> shift * 2.  We can just check whether the shift is a multiple of
> the new_mode calculated by find_shift_sequence (using multiple_p).
> 
> An easier way of converting the shift to a subreg byte offset
> is to use subreg_offset_from_lsb, which also handles
> BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN.
> 

Thanks, I've updated the patch by moving it into find_shift_sequence.
Not sure whether it meets your comments precisely, though it still works :)
There is a comment mentioning that

  /* Some machines like the x86 have shift insns for each size of
 operand.  Other machines like the ppc or the ia-64 may only have
 shift insns that shift values within 32 or 64 bit registers.
 This loop tries to find the smallest shift insn that will right
 justify the value we want to read but is available in one insn on
 the machine.  */

So it will break early without an additional check, as the new_mode is
TImode here:

  if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
break;



[PATCH v2] dse: Remove partial load after full store for high part 
access[PR71309]


This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
18: r122:TI=r119:TI
16: r123:TI#0=r122:TI#8 0>>0
17: r123:TI#8=0
19: r124:DI=r123:TI#0
7: [r118:DI]=r119:TI
8: r121:DI=r124:DI

Final ASM will be as below without partial load after full store(stxv+ld):
  mr 9,3
  ld 3,24(3)
  ld 10,16(3)
  std 3,8(9)
  std 10,0(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression testing on Power9-LE.

For AArch64, one ldr is replaced by mov:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]

gcc/ChangeLog:

2020-07-22  Xionghu Luo  

PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-22  Xionghu Luo  

PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c  | 15 +-
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++
 2 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..e06a9fbb0cd 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1736,7 +1736,8 @@ find_shift_sequence (poly_int64 

Re: [PATCH] rs6000: Define movsf_from_si2 to extract high part SF element from DImode[PR89310]

2020-07-20 Thread luoxhu via Gcc-patches

On 2020/7/20 23:31, Segher Boessenkool wrote:

On Mon, Jul 13, 2020 at 02:30:28PM +0800, luoxhu wrote:

For extracting high part element from DImode register like:

{%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}

split it before reload with "and mask" to avoid generating shift right
32 bit then shift left 32 bit.  This pattern also exists in PR42475 and
PR67741, etc.

srdi 3,3,32
sldi 9,3,32
mtvsrd 1,9
xscvspdpn 1,1

=>

rldicr 3,3,0,31
mtvsrd 1,3
xscvspdpn 1,1



* config/rs6000/rs6000.md (movsf_from_si2): New
define_insn_and_split.


(That fits on one line).


--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr89310.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */



+/* { dg-final { scan-assembler-not {\msrdi\M} } } */
+/* { dg-final { scan-assembler-not {\msldi\M} } } */
+/* { dg-final { scan-assembler-times {\mrldicr\M} 1 } } */


I'm not sure that works on older cpus?  Please test there, and add
-mdejagnu-cpu=power8 to the dg-options if needed.  Also test on BE please.

Okay for trunk with those last details taking care of.  Thank you!


Thanks for the reminder.  Addressed the comments and committed as r11-2245.

Xionghu


Re: [PATCH] rs6000: Define movsf_from_si2 to extract high part SF element from DImode[PR89310]

2020-07-14 Thread luoxhu via Gcc-patches
Hi David,

On 2020/7/14 22:17, David Edelsohn wrote:
> Unfortunately this patch is eliciting a number of new testsuite
> failures, all like
> 
> error: unrecognizable insn:
> (insn 44 43 45 5 (parallel [
>  (set (reg:SI 199)
>  (unspec:SI [
>  (reg:SF 202)
>  ] UNSPEC_SI_FROM_SF))
>  (clobber (scratch:V4SF))
>  ]) 
> "/nasfarm/edelsohn/src/src/gcc/testsuite/gcc.dg/vect/vect-alias-check-11.c":70:1
> -1
>   (nil))
> during RTL pass: vregs
> 
> for
> 
> gcc.dg/vect/vect-alias-check-11.c
> gcc.dg/vect/vect-alias-check-12.c
> gcc.dg/vect/pr57741-2.c
> gcc.dg/vect/pr57741-3.c
> gcc.dg/vect/pr89440.c
> gcc.target/powerpc/sse-movss-1.c

This patch won't match an instruction with a "clobber (scratch:V4SF)";
it only matches "(clobber (match_scratch:DI 2 "=r"))".  I guess you are
replying to the other patch?

"[PATCH 2/2] rs6000: Define define_insn_and_split to split unspec sldi+or to 
rldimi"

Thanks for your fix patch! :)

This patch's regression tests pass on Power8-LE; I re-ran these cases on
Power8-LE and confirmed they pass.  What is your platform, please?
BTW, TARGET_NO_SF_SUBREG ensures TARGET_POWERPC64 for this
define_insn_and_split.
Thanks.

Xionghu

> 
> Thanks, David
> 
> On Mon, Jul 13, 2020 at 2:30 AM luoxhu  wrote:
>>
>> Hi,
>>
>> On 2020/7/11 08:54, Segher Boessenkool wrote:
>>> Hi!
>>>
>>> On Fri, Jul 10, 2020 at 09:39:40AM +0800, luoxhu wrote:
 OK, seems the md file needs a format tool too...
>>>
>>> Heh.  Just make sure it looks good (that is, does what it looks like),
>>> looks like the rest, etc.  It's hard to do anything nice with unspecs,
>>> [ ] lists do not format well.
>>>
>> +  "TARGET_NO_SF_SUBREG"
>> +  "#"
>> +  "&& vsx_reg_sfsubreg_ok (operands[0], SFmode)"
>
> Put this in the insn condition?  And since this is just a predicate,
> you can just use it instead of gpc_reg_operand.
>
> (The split condition becomes "&& 1" then, not "").

 OK, this seems a bit strange as movsi_from_sf, movsf_from_si and
 movdi_from_sf_zero_ext all use it as condition...
>>>
>>> Since in your case you *always* split, the split condition should be
>>> "always".  There are no alternatives that do not split here.
>>>
 And why vsx_reg_sfsubreg_ok allows "SF SUBREGS" and TARGET_NO_SF_SUBREG
 "avoid (SUBREG:SF (REG:SI)", I guess they are not the same meaning? (The
 TARGET_NO_SF_SUBREG is also copied from other similar defines.)  Thanks.
>>>
>>> Good question.  I do not know.
>>>
>>> Well...  Since this define_insn* requires p8 *anyway*, we do not need
>>> any of these sf_subreg things?  We always know for each one if it should
>>> be true or false.
>>
>> Yes, removed the vsx_reg_sfsubreg_ok check.
>>
>>>
 +  "TARGET_NO_SF_SUBREG"
>>>
>>> But here we should require p8 some other way, then.
>>
>> TARGET_NO_SF_SUBREG is defined to TARGET_DIRECT_MOVE_64BIT, and
>> TARGET_DIRECT_MOVE_64BIT is TARGET_DIRECT_MOVE && TARGET_P8_VECTOR && 
>> TARGET_POWERPC64
>> which means TARGET_P8_VECTOR must be true for TARGET_NO_SF_SUBREG.
>>
>>>
 +  (set_attr "isa" "p8v")])
>>>
>>> (This isn't enough, unfortunately).
>>>
>>
>>
>> Updated patch to removed the vsx_reg_sfsubreg_ok and ICE fix:
>>
>>
>> For extracting high part element from DImode register like:
>>
>> {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>>
>> split it before reload with "and mask" to avoid generating shift right
>> 32 bit then shift left 32 bit.  This pattern also exists in PR42475 and
>> PR67741, etc.
>>
>> srdi 3,3,32
>> sldi 9,3,32
>> mtvsrd 1,9
>> xscvspdpn 1,1
>>
>> =>
>>
>> rldicr 3,3,0,31
>> mtvsrd 1,3
>> xscvspdpn 1,1
>>
>> Bootstrap and regression tested pass on Power8-LE.
>>
>> gcc/ChangeLog:
>>
>> 2020-07-13  Xionghu Luo  
>>
>>  PR rtl-optimization/89310
>>  * config/rs6000/rs6000.md (movsf_from_si2): New
>>  define_insn_and_split.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2020-07-13  Xionghu Luo  
>>
>>  PR rtl-optimization/89310
>>  * gcc.target/powerpc/pr89310.c: New test.
>> ---
>>   gcc/config/rs6000/rs6000.md| 31 ++
>>   gcc/testsuite/gcc.target/powerpc/pr89310.c | 17 
>>   2 files changed, 48 insertions(+)
>>   create mode 100644 gcc/testsuite/gcc.target/powerpc/pr89310.c
>>
>> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
>> index 4fcd6a94022..480385ed4d2 100644
>> --- a/gcc/config/rs6000/rs6000.md
>> +++ b/gcc/config/rs6000/rs6000.md
>> @@ -7593,6 +7593,37 @@ (define_insn_and_split "movsf_from_si"
>>  "*,  *, p9v,   p8v,   *, *,
>>   p8v,p8v,   p8v,   *")])
>>
>> +;; For extracting high part element from DImode register like:
>> +;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>> +;; split it before reload with "and mask" to avoid generating shift right
>> +;; 32 bit then shift 

Re: [PATCH] rs6000: Define movsf_from_si2 to extract high part SF element from DImode[PR89310]

2020-07-13 Thread luoxhu via Gcc-patches
Hi,

On 2020/7/11 08:54, Segher Boessenkool wrote:
> Hi!
> 
> On Fri, Jul 10, 2020 at 09:39:40AM +0800, luoxhu wrote:
>> OK, seems the md file needs a format tool too...
> 
> Heh.  Just make sure it looks good (that is, does what it looks like),
> looks like the rest, etc.  It's hard to do anything nice with unspecs,
> [ ] lists do not format well.
> 
 +  "TARGET_NO_SF_SUBREG"
 +  "#"
 +  "&& vsx_reg_sfsubreg_ok (operands[0], SFmode)"
>>>
>>> Put this in the insn condition?  And since this is just a predicate,
>>> you can just use it instead of gpc_reg_operand.
>>>
>>> (The split condition becomes "&& 1" then, not "").
>>
>> OK, this seems a bit strange as movsi_from_sf, movsf_from_si and
>> movdi_from_sf_zero_ext all use it as condition...
> 
> Since in your case you *always* split, the split condition should be
> "always".  There are no alternatives that do not split here.
> 
>> And why vsx_reg_sfsubreg_ok allows "SF SUBREGS" and TARGET_NO_SF_SUBREG
>> "avoid (SUBREG:SF (REG:SI)", I guess they are not the same meaning? (The
>> TARGET_NO_SF_SUBREG is also copied from other similar defines.)  Thanks.
> 
> Good question.  I do not know.
> 
> Well...  Since this define_insn* requires p8 *anyway*, we do not need
> any of these sf_subreg things?  We always know for each one if it should
> be true or false.

Yes, removed the vsx_reg_sfsubreg_ok check.

> 
>> +  "TARGET_NO_SF_SUBREG"
> 
> But here we should require p8 some other way, then.

TARGET_NO_SF_SUBREG is defined as TARGET_DIRECT_MOVE_64BIT, and
TARGET_DIRECT_MOVE_64BIT is TARGET_DIRECT_MOVE && TARGET_P8_VECTOR &&
TARGET_POWERPC64, which means TARGET_P8_VECTOR must be true whenever
TARGET_NO_SF_SUBREG holds.

> 
>> +  (set_attr "isa" "p8v")])
> 
> (This isn't enough, unfortunately).
> 


Updated the patch to remove the vsx_reg_sfsubreg_ok check and fix the ICE:


For extracting high part element from DImode register like:

{%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}

split it before reload with "and mask" to avoid generating shift right
32 bit then shift left 32 bit.  This pattern also exists in PR42475 and
PR67741, etc.

srdi 3,3,32
sldi 9,3,32
mtvsrd 1,9
xscvspdpn 1,1

=>

rldicr 3,3,0,31
mtvsrd 1,3
xscvspdpn 1,1

Bootstrap and regression tested pass on Power8-LE.
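
A C-level view of the transformation (my illustration; the SF bit pattern
sits in the upper 32 bits of the GPR, and the names are made up):

  /* Before: isolate the high word in place with two shifts.  */
  unsigned long long
  high_word_shifts (unsigned long long r)
  {
    return (r >> 32) << 32;               /* srdi 3,3,32 ; sldi 9,3,32 */
  }

  /* After: a single mask, HOST_WIDE_INT_M1U << 32 in the splitter,
     which rs6000 emits as rldicr 3,3,0,31.  */
  unsigned long long
  high_word_mask (unsigned long long r)
  {
    return r & 0xffffffff00000000ULL;
  }

Both leave only the upper word set, ready for mtvsrd + xscvspdpn.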

gcc/ChangeLog:

2020-07-13  Xionghu Luo  

PR rtl-optimization/89310
* config/rs6000/rs6000.md (movsf_from_si2): New
define_insn_and_split.

gcc/testsuite/ChangeLog:

2020-07-13  Xionghu Luo  

PR rtl-optimization/89310
* gcc.target/powerpc/pr89310.c: New test.
---
 gcc/config/rs6000/rs6000.md| 31 ++
 gcc/testsuite/gcc.target/powerpc/pr89310.c | 17 
 2 files changed, 48 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr89310.c

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 4fcd6a94022..480385ed4d2 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -7593,6 +7593,37 @@ (define_insn_and_split "movsf_from_si"
"*,  *, p9v,   p8v,   *, *,
 p8v,p8v,   p8v,   *")])
 
+;; For extracting high part element from DImode register like:
+;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
+;; split it before reload with "and mask" to avoid generating shift right
+;; 32 bit then shift left 32 bit.
+(define_insn_and_split "movsf_from_si2"
+  [(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
+   (unspec:SF
+[(subreg:SI
+  (ashiftrt:DI
+   (match_operand:DI 1 "input_operand" "r")
+   (const_int 32))
+  0)]
+UNSPEC_SF_FROM_SI))
+  (clobber (match_scratch:DI 2 "=r"))]
+  "TARGET_NO_SF_SUBREG"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (GET_CODE (operands[2]) == SCRATCH)
+operands[2] = gen_reg_rtx (DImode);
+
+  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << 32);
+  emit_insn (gen_anddi3 (operands[2], operands[1], mask));
+  emit_insn (gen_p8_mtvsrd_sf (operands[0], operands[2]));
+  emit_insn (gen_vsx_xscvspdpn_directmove (operands[0], operands[0]));
+  DONE;
+}
+  [(set_attr "length" "12")
+  (set_attr "type" "vecfloat")
+  (set_attr "isa" "p8v")])
 
 ;; Move 64-bit binary/decimal floating point
 (define_expand "mov"
diff --git a/gcc/testsuite/gcc.target/powerpc/pr89310.c 
b/gcc/testsuite/gcc.target/powerpc/pr89310.c
new file mode 100644
index 000..15e78509246
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr89310.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+struct s {
+  int i;
+  float f;
+};
+
+float
+foo (struct s arg)
+{
+  return arg.f;
+}
+
+/* { dg-final { scan-assembler-not {\msrdi\M} } } */
+/* { dg-final { scan-assembler-not {\msldi\M} } } */
+/* { dg-final { scan-assembler-times {\mrldicr\M} 1 } } */
-- 
2.21.0.777.g83232e3864




Re: [PATCH 2/2] rs6000: Define define_insn_and_split to split unspec sldi+or to rldimi

2020-07-12 Thread luoxhu via Gcc-patches




On 2020/7/11 08:28, Segher Boessenkool wrote:

Hi!

On Thu, Jul 09, 2020 at 09:14:45PM -0500, Xiong Hu Luo wrote:

* config/rs6000/rs6000.md (rotl_unspec): New
define_insn_and_split.



+; rldimi with UNSPEC_SI_FROM_SF.
+(define_insn_and_split "*rotl_unspec"


Please have rotldi3_insert in the name.  "unspec" in the name doesn't
really mean much...  Can you put "sf" in the name, instead?  So something
like "*rotldi3_insert_sf"?


--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vector_float.c
@@ -0,0 +1,14 @@
+/* { dg-do compile  } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */


This needs p9vector_ok (yes, that name doesn't make too much sense).


+vector float
+test (float *a, float *b, float *c, float *d)
+{
+  return (vector float){*a, *b, *c, *d};
+}
+
+/* { dg-final { scan-assembler-not {\mlxsspx\M} } } */
+/* { dg-final { scan-assembler-not {\mlfs\M} } } */


No lxssp or lfsx either...  or the update forms...

/* { dg-final { scan-assembler-not {\mlxssp} } } */
/* { dg-final { scan-assembler-not {\mlfs} } } */

works fine (there are no other mnemonics starting with those strings).


+/* { dg-final { scan-assembler-times {\mlwz\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mrldimi\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 1 } } */


Okay for trunk with those changes (or post again if you prefer).  Thanks!



Thanks.  The two patches were committed to trunk (r11-2043, r11-2044) after
the modifications.

Xionghu


Re: [PATCH] rs6000: Split movsf_from_si from high word before reload[PR89310]

2020-07-10 Thread luoxhu via Gcc-patches



On 2020/7/10 03:25, Segher Boessenkool wrote:
> 
>> +  "TARGET_NO_SF_SUBREG"
>> +  "#"
>> +  "&& vsx_reg_sfsubreg_ok (operands[0], SFmode)"
> 
> Put this in the insn condition?  And since this is just a predicate,
> you can just use it instead of gpc_reg_operand.
> 
> (The split condition becomes "&& 1" then, not "").
> 

+(define_insn_and_split "movsf_from_si2"
+  [(set (match_operand:SF 0 "vsx_reg_sfsubreg_ok" "=wa")
+   (unspec:SF
+[(subreg:SI
+  (ashiftrt:DI
+   (match_operand:DI 1 "input_operand" "r")
+   (const_int 32))
+  0)]
+UNSPEC_SF_FROM_SI))
+  (clobber (match_scratch:DI 2 "=r"))]
+  "TARGET_NO_SF_SUBREG"
+  "#"
+  "&& 1"
+  [(const_int 0)]

This change is invalid as it causes an ICE in PR42475.c.  The reason is:
insn #29 gets combined into insn #40, and although split1 succeeds, it
causes an ICE in sched1 because op[0] is not SFmode.  Without this change,
#29 is not combined into #40, because gpc_reg_operand (operands[0], E_SFmode)
makes the match fail for (subreg:SF (reg:SI 155 [ _4 ]) 0).


pr42475.c.268r.combine:
Trying 29 -> 40:
   29: {r120:SF=unspec[r133:DI>>0x20#0] 86;clobber scratch;}
   40: r155:SI#0=r120:SF
  REG_DEAD r120:SF
Successfully matched this instruction:
(set (subreg:SF (reg:SI 155 [ _4 ]) 0)
(unspec:SF [
(subreg:SI (ashiftrt:DI (reg:DI 133)
(const_int 32 [0x20])) 0)
] UNSPEC_SF_FROM_SI))
allowing combination of insns 29 and 40
original costs 12 + 4 = 16
replacement cost 12
deferring deletion of insn with uid = 29.
modifying insn i340: {r155:SI#0=unspec[r133:DI>>0x20#0] 86;clobber scratch;}
  REG_DEAD r133:DI
deferring rescan insn with uid = 40.


pr42475.c.273r.split1:
69: r172:DI=r133:DI&0x
70: r155:SI#0=unspec[r172:DI] 62
71: r155:SI#0=unspec[r155:SI#0] 103
41: NOTE_INSN_DELETED
42: r157:DI=r155:SI#0<<0x20


pr42475.c.280r.sched1:
pr42475.c: In function ‘bar’:
pr42475.c:20:1: error: unrecognizable insn:
   20 | }
  | ^
(insn 71 70 41 2 (set (subreg:SF (reg:SI 155 [ _4 ]) 0)
(unspec:SF [
(subreg:SF (reg:SI 155 [ _4 ]) 0)
] UNSPEC_VSX_CVSPDPN)) -1
 (nil))
during RTL pass: sched1
dump file: pr42475.c.280r.sched1
pr42475.c:20:1: internal compiler error: in extract_insn, at recog.c:2294
Please submit a full bug report,
with preprocessed source if appropriate.
See  for instructions.


Versus not using vsx_reg_sfsubreg_ok as the condition:


pr42475.c.268r.combine:
Trying 29 -> 40:
   29: {r120:SF=unspec[r133:DI>>0x20#0] 86;clobber scratch;}
   40: r155:SI#0=r120:SF
  REG_DEAD r120:SF
Failed to match this instruction:
(set (subreg:SF (reg:SI 155 [ _4 ]) 0)
(unspec:SF [
(subreg:SI (ashiftrt:DI (reg:DI 133)
(const_int 32 [0x20])) 0)
] UNSPEC_SF_FROM_SI))


273r.split1:
69: r172:DI=r133:DI&0x
70: r120:SF=unspec[r172:DI] 62
71: r120:SF=unspec[r120:SF] 103
40: r155:SI#0=r120:SF
REG_DEAD r120:SF


Re: [PATCH 1/2] rs6000: Init V4SF vector without converting SP to DP

2020-07-09 Thread luoxhu via Gcc-patches
Updated patch to keep the logic for non-TARGET_P8_VECTOR targets.
Please ignore the previous [PATCH 1/2], sorry!


Move V4SF to V4SI, init vector like V4SI and move to V4SF back.
Better instruction sequence could be generated on Power9:

lfs + xxpermdi + xvcvdpsp + vmrgew
=>
lwz + (sldi + or) + mtvsrdd

With the patch followed, it could be continue optimized to:

lwz + rldimi + mtvsrdd

The point is to use lwz to avoid converting single precision to double
precision upon load, and to pack the four 32-bit values into one 128-bit
register directly.
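
A hedged C model of the packing idea (my sketch, not the expander code;
endian handling and final lane placement elided):

  #include <stdint.h>
  #include <string.h>

  /* lwz-style load: reinterpret the float's bits as a 32-bit integer,
     with no single- to double-precision conversion.  */
  static uint32_t
  load_bits (const float *p)
  {
    uint32_t u;
    memcpy (&u, p, sizeof u);
    return u;
  }

  /* Pack two 32-bit lanes into one 64-bit half (the sldi + or, or the
     rldimi of the follow-up patch).  */
  static uint64_t
  pack2 (uint32_t lo, uint32_t hi)
  {
    return ((uint64_t) hi << 32) | lo;
  }

  /* Four floats -> two 64-bit halves; the two halves are then combined
     into the 128-bit vector register with mtvsrdd.  */
  void
  pack4 (const float *a, const float *b, const float *c, const float *d,
         uint64_t out[2])
  {
    out[0] = pack2 (load_bits (a), load_bits (b));
    out[1] = pack2 (load_bits (c), load_bits (d));
  }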

gcc/ChangeLog:

2020-07-10  Xionghu Luo  

* config/rs6000/rs6000.c (rs6000_expand_vector_init):
Move V4SF to V4SI, init vector like V4SI and move to V4SF back.
---
 gcc/config/rs6000/rs6000.c | 55 +-
 1 file changed, 37 insertions(+), 18 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 58f5d780603..00972fb5165 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -6423,29 +6423,48 @@ rs6000_expand_vector_init (rtx target, rtx vals)
}
   else
{
- rtx dbl_even = gen_reg_rtx (V2DFmode);
- rtx dbl_odd  = gen_reg_rtx (V2DFmode);
- rtx flt_even = gen_reg_rtx (V4SFmode);
- rtx flt_odd  = gen_reg_rtx (V4SFmode);
- rtx op0 = force_reg (SFmode, XVECEXP (vals, 0, 0));
- rtx op1 = force_reg (SFmode, XVECEXP (vals, 0, 1));
- rtx op2 = force_reg (SFmode, XVECEXP (vals, 0, 2));
- rtx op3 = force_reg (SFmode, XVECEXP (vals, 0, 3));
-
- /* Use VMRGEW if we can instead of doing a permute.  */
  if (TARGET_P8_VECTOR)
{
- emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op2));
- emit_insn (gen_vsx_concat_v2sf (dbl_odd, op1, op3));
- emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even));
- emit_insn (gen_vsx_xvcvdpsp (flt_odd, dbl_odd));
- if (BYTES_BIG_ENDIAN)
-   emit_insn (gen_p8_vmrgew_v4sf_direct (target, flt_even, 
flt_odd));
- else
-   emit_insn (gen_p8_vmrgew_v4sf_direct (target, flt_odd, 
flt_even));
+ rtx tmpSF[4];
+ rtx tmpSI[4];
+ rtx tmpDI[4];
+ rtx mrgDI[4];
+ for (i = 0; i < 4; i++)
+   {
+ tmpSI[i] = gen_reg_rtx (SImode);
+ tmpDI[i] = gen_reg_rtx (DImode);
+ mrgDI[i] = gen_reg_rtx (DImode);
+ tmpSF[i] = force_reg (SFmode, XVECEXP (vals, 0, i));
+ emit_insn (gen_movsi_from_sf (tmpSI[i], tmpSF[i]));
+ emit_insn (gen_zero_extendsidi2 (tmpDI[i], tmpSI[i]));
+   }
+
+ if (!BYTES_BIG_ENDIAN)
+   {
+ std::swap (tmpDI[0], tmpDI[1]);
+ std::swap (tmpDI[2], tmpDI[3]);
+   }
+
+ emit_insn (gen_ashldi3 (mrgDI[0], tmpDI[0], GEN_INT (32)));
+ emit_insn (gen_iordi3 (mrgDI[1], mrgDI[0], tmpDI[1]));
+ emit_insn (gen_ashldi3 (mrgDI[2], tmpDI[2], GEN_INT (32)));
+ emit_insn (gen_iordi3 (mrgDI[3], mrgDI[2], tmpDI[3]));
+
+ rtx tmpV2DI = gen_reg_rtx (V2DImode);
+ emit_insn (gen_vsx_concat_v2di (tmpV2DI, mrgDI[1], mrgDI[3]));
+ emit_move_insn (target, gen_lowpart (V4SFmode, tmpV2DI));
}
  else
{
+ rtx dbl_even = gen_reg_rtx (V2DFmode);
+ rtx dbl_odd  = gen_reg_rtx (V2DFmode);
+ rtx flt_even = gen_reg_rtx (V4SFmode);
+ rtx flt_odd  = gen_reg_rtx (V4SFmode);
+ rtx op0 = force_reg (SFmode, XVECEXP (vals, 0, 0));
+ rtx op1 = force_reg (SFmode, XVECEXP (vals, 0, 1));
+ rtx op2 = force_reg (SFmode, XVECEXP (vals, 0, 2));
+ rtx op3 = force_reg (SFmode, XVECEXP (vals, 0, 3));
+
  emit_insn (gen_vsx_concat_v2sf (dbl_even, op0, op1));
  emit_insn (gen_vsx_concat_v2sf (dbl_odd, op2, op3));
  emit_insn (gen_vsx_xvcvdpsp (flt_even, dbl_even));
-- 
2.27.0.90.geebb51ba8c




Re: [PATCH] rs6000: Define movsf_from_si2 to extract high part SF element from DImode[PR89310]

2020-07-09 Thread luoxhu via Gcc-patches
Hi,

On 2020/7/10 03:25, Segher Boessenkool wrote:
> Hi!
> 
> On Thu, Jul 09, 2020 at 11:09:42AM +0800, luoxhu wrote:
>>> Maybe change it back to just SI?  It won't match often at all for QI or
>>> HI anyway, it seems.  Sorry for that detour.  Should be good with the
>>> above nits fixed :-)
>>
>> OK, if I see correctly, subreg of DImode should be SImode and I used
>> subreg:SI to match only SI, so no need to consider mask for QI and HI? :)
> 
> My problem with it was that the shift amounts won't be 32 for QI etc.?

But if subreg:SI only accepts SImode coming in, why do we need to handle the
shift amounts for QI/HI?  It is not using QHSI here...

> 
>> +;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>> +;; split it before reload with "and mask" to avoid generating shift right
>> +;; 32 bit then shift left 32 bit.
>> +(define_insn_and_split "movsf_from_si2"
>> +  [(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
>> +(unspec:SF [
>> + (subreg:SI (
>> + ashiftrt:DI (
> 
> Opening parens *start* a line, they never end it.  So:
> 
> (define_insn_and_split "movsf_from_si2"
>[(set (match_operand:SF 0 "vsx_reg_sfsubreg_ok" "=wa")
> (unspec:SF
>   [(subreg:SI
>  (ashiftrt:DI
>(match_operand:DI 1 "input_operand" "r")
>(const_int 32))
> 0)]
>   UNSPEC_SF_FROM_SI))
> (clobber (match_scratch:DI 2 "=r"))]
> 
> or something like that.  There aren't really any real rules...  The
> important points are that nested things should be indented, and things
> at the same level should have the same indent (like the outer set and
> clobber).  The [ for an unspec is sometimes put at the end of a line,
> that reads a little better perhaps.

OK, seems the md file needs a format tool too...

> 
>> +  "TARGET_NO_SF_SUBREG"
>> +  "#"
>> +  "&& vsx_reg_sfsubreg_ok (operands[0], SFmode)"
> 
> Put this in the insn condition?  And since this is just a predicate,
> you can just use it instead of gpc_reg_operand.
> 
> (The split condition becomes "&& 1" then, not "").

OK, this seems a bit strange, as movsi_from_sf, movsf_from_si and
movdi_from_sf_zero_ext all use it as the condition...

And why does vsx_reg_sfsubreg_ok allow "SF SUBREGS" while TARGET_NO_SF_SUBREG
means "avoid (SUBREG:SF (REG:SI)"?  I guess they do not have the same meaning?
(The TARGET_NO_SF_SUBREG is also copied from other similar defines.)  Thanks.


Updated the patch and corrected the title:


Define movsf_from_si2 to extract high part SF element from DImode[PR89310]


For extracting high part element from DImode register like:

{%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}

split it before reload with "and mask" to avoid generating shift right
32 bit then shift left 32 bit.  This pattern also exists in PR42475 and
PR67741, etc.

srdi 3,3,32
sldi 9,3,32
mtvsrd 1,9
xscvspdpn 1,1

=>

rldicr 3,3,0,31
mtvsrd 1,3
xscvspdpn 1,1

Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog:

2020-07-08  Xionghu Luo  

PR rtl-optimization/89310
* config/rs6000/rs6000.md (movsf_from_si2): New
define_insn_and_split.

gcc/testsuite/ChangeLog:

2020-07-08  Xionghu Luo  

PR rtl-optimization/89310
* gcc.target/powerpc/pr89310.c: New test.
---
 gcc/config/rs6000/rs6000.md| 32 ++
 gcc/testsuite/gcc.target/powerpc/pr89310.c | 17 
 2 files changed, 49 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr89310.c

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 4fcd6a94022..2331c84dcbd 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -7593,6 +7593,38 @@ (define_insn_and_split "movsf_from_si"
"*,  *, p9v,   p8v,   *, *,
 p8v,p8v,   p8v,   *")])
 
+;; For extracting high part element from DImode register like:
+;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
+;; split it before reload with "and mask" to avoid generating shift right
+;; 32 bit then shift left 32 bit.
+(define_insn_and_split "movsf_from_si2"
+  [(set (match_operand:SF 0 "vsx_reg_sfsubreg_ok" "=wa")
+   (unspec:SF
+[(subreg:SI
+  (ashiftrt:DI
+   (match_operand:DI 1 "input_operand" "r")
+   (const_int 32))
+  0)]
+UNSPEC_SF_FROM_SI))
+  (clobber (match_scratch:DI 2 "=r"))]
+  "TARGET_NO_SF_SUBREG"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (GET_CODE (operands[2]) == SCRATCH)
+operands[2] = gen_reg_rtx (DImode);
+
+  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << 32);
+  emit_insn (gen_anddi3 (operands[2], operands[1], mask));
+  emit_insn (gen_p8_mtvsrd_sf (operands[0], operands[2]));
+  emit_insn (gen_vsx_xscvspdpn_directmove (operands[0], operands[0]));
+  DONE;
+}
+  [(set_attr "length" "12")
+  (set_attr "type" "vecfloat")
+  (set_attr "isa" "p8v")])
 
 ;; Move 64-bit binary/decimal floating point

Re: [PATCH] rs6000: Split movsf_from_si from high word before reload[PR89310]

2020-07-08 Thread luoxhu via Gcc-patches



On 2020/7/9 06:43, Segher Boessenkool wrote:
> Hi!
> 
> On Wed, Jul 08, 2020 at 11:19:21AM +0800, luoxhu wrote:
>> For extracting high part element from DImode register like:
>>
>> {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>>
>> split it before reload with "and mask" to avoid generating shift right
>> 32 bit then shift left 32 bit.  This pattern also exists in PR42475 and
>> PR67741, etc.
> 
>> 2020-07-08  Xionghu Luo  
>>
>>  PR rtl-optimization/89310
>>  * config/rs6000/rs6000.md (movsf_from_si2): New
>>  define_insn_and_split.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2020-07-08  Xionghu Luo  
>>
>>  PR rtl-optimization/89310
>>  * gcc.target/powerpc/pr89310.c: New test.
> 
>> +;; For extracting high part element from DImode register like:
>> +;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>> +;; split it before reload with "and mask" to avoid generating shift right
>> +;; 32 bit then shift left 32 bit.
>> +(define_insn_and_split "movsf_from_si2"
>> +  [(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
>> +(unspec:SF [
>> + (subreg:SI (ashiftrt:DI
> 
> Put the  (ashiftrt:DI  on a separate line as well?  With indent changes,
> etc.
> 
>> +   (match_operand:DI 1 "input_operand" "r")
>> +   (const_int 32))
>> +  0)]
>> + UNSPEC_SF_FROM_SI))
>> +(clobber (match_scratch:DI 2 "=r"))]
>> +  "TARGET_NO_SF_SUBREG"
>> +  "@
>> +  #"
> 
> @ with only one alternative doesn't do anything; so just write
>"#"
> please.
> 
>> +
> 
> And no empty line here.
> 
>> +  "&& !reload_completed
> 
> Why this?  Why will this not work after reload?  In the very few cases
> where you do need this, you usually also need to check for
> lra_in_progress.
> 
>> +   && vsx_reg_sfsubreg_ok (operands[0], SFmode)"
>> +  [(const_int 0)]
>> +{
>> +  if (GET_CODE (operands[2]) == SCRATCH)
>> +operands[2] = gen_reg_rtx (DImode);
>> +
>> +  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << 32);
> 
> The mask should be different for QI and HI.
> 
>> +  emit_insn (gen_anddi3 (operands[2], operands[1], mask));
>> +  emit_insn (gen_p8_mtvsrd_sf (operands[0], operands[2]));
>> +  emit_insn (gen_vsx_xscvspdpn_directmove (operands[0], operands[0]));
>> +  DONE;
>> +}
>> +  [(set_attr "length" "12")
>> +  (set_attr "type" "vecfloat")
>> +  (set_attr "isa" "p8v")])
>> +
> 
> No extra whiteline please.
> 
> 
> Maybe change it back to just SI?  It won't match often at all for QI or
> HI anyway, it seems.  Sorry for that detour.  Should be good with the
> above nits fixed :-)

OK, if I see correctly, the subreg of DImode should be SImode, and I used
subreg:SI to match only SI, so there is no need to consider a mask for QI and
HI? :)

Others are updated: removed reload_completed and adjusted the format.


For extracting high part element from DImode register like:

{%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}

split it before reload with "and mask" to avoid generating shift right
32 bit then shift left 32 bit.  This pattern also exists in PR42475 and
PR67741, etc.

srdi 3,3,32
sldi 9,3,32
mtvsrd 1,9
xscvspdpn 1,1

=>

rldicr 3,3,0,31
mtvsrd 1,3
xscvspdpn 1,1

Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog:

2020-07-08  Xionghu Luo  

PR rtl-optimization/89310
* config/rs6000/rs6000.md (movsf_from_si2): New
define_insn_and_split.

gcc/testsuite/ChangeLog:

2020-07-08  Xionghu Luo  

PR rtl-optimization/89310
* gcc.target/powerpc/pr89310.c: New test.
---
 gcc/config/rs6000/rs6000.md| 31 ++
 gcc/testsuite/gcc.target/powerpc/pr89310.c | 17 
 2 files changed, 48 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr89310.c

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 4fcd6a94022..a493dfd4596 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -7593,6 +7593,37 @@ (define_insn_and_split "movsf_from_si"
"*,  *, p9v,   p8v,   *, *,
 p8v,p8v,   p8v,   *")])
 
+;; For extracting high part element from DImode register like:
+;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
+;; split it before reload with "and mask" to avoid generating shift right
+;; 32 bit then shift left 32 bit.
+(define_insn_and_split "movsf_from_si2"
+  [(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
+   (unspec:SF [
+(subreg:SI (
+ashiftrt:DI (
+  match_operand:DI 1 "input_operand" "r")
+(const_int 32))
+ 0)]
+UNSPEC_SF_FROM_SI))
+   (clobber (match_scratch:DI 2 "=r"))]
+  "TARGET_NO_SF_SUBREG"
+  "#"
+  "&& vsx_reg_sfsubreg_ok (operands[0], SFmode)"
+  [(const_int 0)]
+{
+  if (GET_CODE (operands[2]) == SCRATCH)
+operands[2] = gen_reg_rtx (DImode);
+
+  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << 32);
+  emit_insn (gen_anddi3 (operands[2], operands[1], 

Re: [PATCH] rs6000: Split movsf_from_si from high word before reload[PR89310]

2020-07-07 Thread luoxhu via Gcc-patches



On 2020/7/8 05:31, Segher Boessenkool wrote:
> Hi!
> 
> On Tue, Jul 07, 2020 at 04:39:58PM +0800, luoxhu wrote:
>>> Lots of questions, sorry!
>>
>> Thanks for the nice suggestions; the initial patch contained many issues :),
> 
> Pretty much all of it should *work*, it just can be improved and
> simplified quite a bit :-)
> 
>> For this case, %1:SF matches with "=wa"?  And how can I construct cases to
>> match the ("=?r", "wa") and ("=!r", "r") combinations, please?
> 
> operands[0], not operands[1]?
> 
> Simple testcases will not put the output into a GPR, unless you force
> the compiler to do that, because of the ? and !.
> 
> Often you can just do
> 
>asm("#" : "+r"(x));
> 
> to force "x" into a GPR at that point of the program.  But there is
> nothing stopping the compiler from copying it back to a VSR where it
> thinks that is cheaper ;-)
> 
> So maybe this pattern should just have the GPR-to-VSR alternative?  It
> does not look like the GPR destination variants are useful?
> 
>> +  rtx op0 = operands[0];
>> +  rtx op1 = operands[1];
>> +  rtx op2 = operands[2];
> 
> (Please just write out operands[N] everywhere).
> 
>> +  if (GET_CODE (operands[2]) == SCRATCH)
>> +op2 = gen_reg_rtx (DImode);
>> +
>> +  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << 32);
>> +  emit_insn (gen_anddi3 (op2, op1, mask));
> 
> Groovy :-)
> 
> So, it looks like you can remove the ? and ! alternatives, leaving just
> the first alternative?
> 

Thanks.

V3 Update: Leave only GPR-to-VSR alternative and use operands[N].
Bootstrap and regression tested pass on Power8-LE.


For extracting high part element from DImode register like:

{%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}

split it before reload with "and mask" to avoid generating shift right
32 bit then shift left 32 bit.  This pattern also exists in PR42475 and
PR67741, etc.

srdi 3,3,32
sldi 9,3,32
mtvsrd 1,9
xscvspdpn 1,1

=>

rldicr 3,3,0,31
mtvsrd 1,3
xscvspdpn 1,1

gcc/ChangeLog:

2020-07-08  Xionghu Luo  

PR rtl-optimization/89310
* config/rs6000/rs6000.md (movsf_from_si2): New
define_insn_and_split.

gcc/testsuite/ChangeLog:

2020-07-08  Xionghu Luo  

PR rtl-optimization/89310
* gcc.target/powerpc/pr89310.c: New test.
---
 gcc/config/rs6000/rs6000.md| 34 ++
 gcc/testsuite/gcc.target/powerpc/pr89310.c | 17 +++
 2 files changed, 51 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr89310.c

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 4fcd6a94022..02c5171378c 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -7593,6 +7593,40 @@ (define_insn_and_split "movsf_from_si"
"*,  *, p9v,   p8v,   *, *,
 p8v,p8v,   p8v,   *")])
 
+;; For extracting high part element from DImode register like:
+;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
+;; split it before reload with "and mask" to avoid generating shift right
+;; 32 bit then shift left 32 bit.
+(define_insn_and_split "movsf_from_si2"
+  [(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
+   (unspec:SF [
+(subreg:SI (ashiftrt:DI
+  (match_operand:DI 1 "input_operand" "r")
+  (const_int 32))
+ 0)]
+UNSPEC_SF_FROM_SI))
+   (clobber (match_scratch:DI 2 "=r"))]
+  "TARGET_NO_SF_SUBREG"
+  "@
+  #"
+
+  "&& !reload_completed
+   && vsx_reg_sfsubreg_ok (operands[0], SFmode)"
+  [(const_int 0)]
+{
+  if (GET_CODE (operands[2]) == SCRATCH)
+operands[2] = gen_reg_rtx (DImode);
+
+  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << 32);
+  emit_insn (gen_anddi3 (operands[2], operands[1], mask));
+  emit_insn (gen_p8_mtvsrd_sf (operands[0], operands[2]));
+  emit_insn (gen_vsx_xscvspdpn_directmove (operands[0], operands[0]));
+  DONE;
+}
+  [(set_attr "length" "12")
+  (set_attr "type" "vecfloat")
+  (set_attr "isa" "p8v")])
+
 
 ;; Move 64-bit binary/decimal floating point
 (define_expand "mov"
diff --git a/gcc/testsuite/gcc.target/powerpc/pr89310.c 
b/gcc/testsuite/gcc.target/powerpc/pr89310.c
new file mode 100644
index 000..15e78509246
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr89310.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+struct s {
+  int i;
+  float f;
+};
+
+float
+foo (struct s arg)
+{
+  return arg.f;
+}
+
+/* { dg-final { scan-assembler-not {\msrdi\M} } } */
+/* { dg-final { scan-assembler-not {\msldi\M} } } */
+/* { dg-final { scan-assembler-times {\mrldicr\M} 1 } } */
-- 
2.21.0.777.g83232e3864



Re: [PATCH] rs6000: Split movsf_from_si from high word before reload[PR89310]

2020-07-07 Thread luoxhu via Gcc-patches



On 2020/7/7 08:18, Segher Boessenkool wrote:
> Hi!
> 
> On Sun, Jul 05, 2020 at 09:17:57PM -0500, Xionghu Luo wrote:
>> For extracting high part element from DImode register like:
>>
>> {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>>
>> split it before reload with "and mask" to avoid generating shift right
>> 32 bit then shift left 32 bit.
>>
>> srdi 3,3,32
>> sldi 9,3,32
>> mtvsrd 1,9
>> xscvspdpn 1,1
>>
>> =>
>>
>> rldicr 3,3,0,31
>> mtvsrd 1,3
>> xscvspdpn 1,1
> 
> Great :-)
> 
>> +;; For extracting high part element from DImode register like:
>> +;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>> +;; split it before reload with "and mask" to avoid generating shift right
>> +;; 32 bit then shift left 32 bit.
>> +(define_insn_and_split "movsf_from_si2"
>> +  [(set (match_operand:SF 0 "nonimmediate_operand"
>> +"=!r,   f, v, wa,m, Z,
>> + Z, wa,?r,!r")
>> +(unspec:SF [
>> + (subreg:SI (ashiftrt:DI
>> +   (match_operand:DI 1 "input_operand"
>> +   "m, m, wY,Z, r, f,
>> +   wa,r, wa,r")
>> +  (const_int 32)) 0)]
>> +   UNSPEC_SF_FROM_SI))
>> +  (clobber (match_scratch:DI 2
>> +"=X,X, X, X, X, X,
>> +X, r, X, X"))]
>> +  "TARGET_NO_SF_SUBREG
>> +   && (register_operand (operands[0], SFmode)
>> +   && register_operand (operands[1], DImode))"
> 
> If the insn condition requires operands 0 and 1 to be register_operands,
> it can ask for that in the predicates, instead: not nonimmediate_operand
> and input_operand, but just gpc_reg_operand instead.  You can leave out
> the impossible alternatives as well (0, 1, 2, 3, 4, 5, 6), leaving just
> 
> (define_insn_and_split "movsf_from_si2"
>[(set (match_operand:SF 0 "gpc_reg_operand" "=wa,?r,!r")
>   (unspec:SF
> [(subreg:SI (ashiftrt:DI
>   (match_operand:DI 1 "input_operand" "r,wa,r")
>   (const_int 32))
> 0)]
> UNSPEC_SF_FROM_SI)))]
>"TARGET_NO_SF_SUBREG"
>"@
> #
> mfvsrwz %0,%x1
> mr %0,%1"
> 
>"&& !reload_completed
> && vsx_reg_sfsubreg_ok (operands[0], SFmode)"
>[(const_int 0)]
> {
>rtx op0 = operands[0];
>rtx op1 = operands[1];
>rtx tmp = gen_reg_rtx (DImode);
> 
> You cannot call gen_reg_rtx too late in the pass pipeline.  What we
> usually do for such cases is put it as a match_scratch in the pattern,
> and then do code like
> 
>if (GET_CODE (operands[2]) == SCRATCH)
>  operands[2] = gen_reg_rtx (DImode);
> 
> so that it will work both before and after reload.
> 
>/* Avoid split {r155:SI#0=unspec[r133:DI>>0x20#0] 86;clobber scratch;} 
> from PR42745.  */
> 
> (This line is too long, btw.)
> 
>if (!SUBREG_P (operands[0]))
>  {
>rtx mask = GEN_INT (HOST_WIDE_INT_M1U << 32);
>emit_insn (gen_anddi3 (tmp, op1, mask));
>emit_insn (gen_p8_mtvsrd_sf (op0, tmp));
>emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
>DONE;
>  }
>else
>  FAIL;
> }
>[(set_attr "length" "12,*,*")
> (set_attr "type" "vecfloat,mffgpr,*")
> (set_attr "isa" "p8v,p8v,*")])
> 
> I wonder what will happen if you actually do FAIL here...  There then is
> no insn alternative that can match, so we ICE?  In that case, just leave
> out the whole FAIL thing, it is useless?  You can do a gcc_assert if you
> want to check something.
> 
> Oh, and maybe you only want to handle GPRs here, not VSRs?  So just the
> "r", not the "wa" at all?  What code would it generate for vector regs?
> 
> Lots of questions, sorry!

Thanks for the nice suggestions; the initial patch contained many issues :),

For this case, %1:SF matches with "=wa"?  And how can I construct cases to
match the ("=?r", "wa") and ("=!r", "r") combinations, please?

Removed lots of copy-paste from "movsf_from_si" and updated the patch with
your comments:


For extracting high part element from DImode register like:

{%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}

split it before reload with "and mask" to avoid generating shift right
32 bit then shift left 32 bit.  This pattern also exists in PR42475 and
PR67741, etc.

srdi 3,3,32
sldi 9,3,32
mtvsrd 1,9
xscvspdpn 1,1

=>

rldicr 3,3,0,31
mtvsrd 1,3
xscvspdpn 1,1

Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog:

2020-07-07  Xionghu Luo  

PR rtl-optimization/89310
* config/rs6000/rs6000.md (movsf_from_si2): New
define_insn_and_split.

gcc/testsuite/ChangeLog:

2020-07-07  Xionghu Luo  

PR rtl-optimization/89310
* gcc.target/powerpc/pr89310.c: New test.
---
 gcc/config/rs6000/rs6000.md| 40 ++
 gcc/testsuite/gcc.target/powerpc/pr89310.c | 17 +
 2 files changed, 57 insertions(+)
 create mode 100644 

Ping^1 : [PATCH] [stage1] ipa-cp: Fix PGO regression caused by r278808

2020-06-15 Thread luoxhu via Gcc-patches

Gentle ping...


On 2020/6/1 09:45, Xionghu Luo wrote:

resend the patch for stage1:
https://gcc.gnu.org/pipermail/gcc-patches/2020-January/538186.html

The performance of exchange2 built with PGO decreases by ~28% after r278808
because the profile counts are set incorrectly.  The cloned nodes are updated
to a very small count, which causes the later cunroll pass to fail to unroll
the recursive function in exchange2.

digits_2 ->
digits_2.constprop.0, digits_2.constprop.1, etc.

1. Proportionally distribute orig_sum to the new nodes for a self-recursive
node (k is the number of self-recursive call sites); a small worked example
follows the list below:
new_sum = (orig_sum + new_sum) * 1 / k \
* self_recursive_probability * (1 / param_ipa_cp_max_recursive_depth).
2. Two mutually recursive functions are not supported by the self-recursive
clone yet, so they are also not supported in update_profiling_info here.
3. Restrict the value range of param_ipa_cp_max_recursive_depth to (0, 8).
If the function calls itself with two different argument sets, the recursive
boundary limit usually stops the specialization first; otherwise compilation
is slow even without recursive specialization.
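
A small worked example of step 1, with hypothetical numbers only (the real
code in the hunk below works on profile_count values via apply_scale):

/* Illustrative sketch, not GCC code: redistribute counts for one
   self-recursive node that is cloned param_ipa_cp_max_recursive_depth
   times.  */
int
example (void)
{
  double orig_sum = 1000.0;     /* count of the original node             */
  double new_sum = 9000.0;      /* count currently left for the clones    */
  double k = 1.0;               /* number of self-recursive call sites    */
  double self_rec_prob = 0.9;   /* probability of the self-recursive edge */
  double depth = 8.0;           /* param_ipa_cp_max_recursive_depth       */

  /* (1000 + 9000) * 1/1 * 0.9 * 1/8 = 1125 per specialized clone, instead
     of the tiny count that made cunroll give up.  */
  return (int) ((orig_sum + new_sum) * (1.0 / k) * self_rec_prob
		* (1.0 / depth));
}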

gcc/ChangeLog:

2020-05-29  Xionghu Luo  

* ipa-cp.c (update_profiling_info): Check self_scc node.
* params.opt (ipa-cp-max-recursive-depth): Set param range.

gcc/testsuite/ChangeLog:

2020-05-29  Xionghu Luo  

* gcc.dg/ipa/ipa-clone-3.c: Update count.
---
  gcc/ipa-cp.c   | 33 +-
  gcc/params.opt |  2 +-
  gcc/testsuite/gcc.dg/ipa/ipa-clone-3.c |  2 +-
  3 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index c64e9104a94..919c741ecfa 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -2046,7 +2046,7 @@ propagate_vals_across_arith_jfunc (cgraph_edge *cs,
/* Recursively generate lattice values with a limited count.  */
FOR_EACH_VEC_ELT (val_seeds, i, src_val)
{
- for (int j = 1; j < max_recursive_depth; j++)
+ for (int j = 0; j < max_recursive_depth; j++)
{
  tree cstval = get_val_across_arith_op (opcode, opnd1_type, opnd2,
 src_val, res_type);
@@ -4345,6 +4345,37 @@ update_profiling_info (struct cgraph_node *orig_node,
  false);
new_sum = stats.count_sum;
  
+  class ipa_node_params *info = IPA_NODE_REF (orig_node);

+  if (info && info->node_is_self_scc)
+{
+  profile_count self_recursive_count = profile_count::zero ();
+  unsigned k = 0;
+
+  /* The self recursive edge is on the orig_node.  */
+  for (cs = orig_node->callees; cs; cs = cs->next_callee)
+   if (ipa_edge_within_scc (cs))
+ {
+   cgraph_node *callee = cs->callee->function_symbol ();
+   if (callee && cs->caller == cs->callee)
+ {
+   self_recursive_count += cs->count;
+   k++;
+ }
+ }
+
+  /* Proportion count for multiple self recursive node.  */
+  if (k)
+   self_recursive_count.apply_scale (1, k);
+
+  /* Proportion count for self recursive node from all callers.  */
+  new_sum
+   = (orig_sum + new_sum).apply_scale (self_recursive_count, orig_sum);
+
+  /* Proportion count for specialized cloned node.  */
+  if (param_ipa_cp_max_recursive_depth)
+   new_sum = new_sum.apply_scale (1, param_ipa_cp_max_recursive_depth);
+}
+
if (orig_node_count < orig_sum + new_sum)
  {
if (dump_file)
diff --git a/gcc/params.opt b/gcc/params.opt
index 4aec480798b..ad000145dfb 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -199,7 +199,7 @@ Common Joined UInteger Var(param_ipa_cp_loop_hint_bonus) 
Init(64) Param Optimiza
  Compile-time bonus IPA-CP assigns to candidates which make loop bounds or 
strides known.
  
  -param=ipa-cp-max-recursive-depth=

-Common Joined UInteger Var(param_ipa_cp_max_recursive_depth) Init(8) Param 
Optimization
+Common Joined UInteger Var(param_ipa_cp_max_recursive_depth) Init(7) 
IntegerRange(0, 8) Param Optimization
  Maximum depth of recursive cloning for self-recursive function.
  
  -param=ipa-cp-min-recursive-probability=

diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-clone-3.c 
b/gcc/testsuite/gcc.dg/ipa/ipa-clone-3.c
index a73cb8b63fc..1efa23ae109 100644
--- a/gcc/testsuite/gcc.dg/ipa/ipa-clone-3.c
+++ b/gcc/testsuite/gcc.dg/ipa/ipa-clone-3.c
@@ -41,4 +41,4 @@ int main ()
return recur_fn ();
  }
  
-/* { dg-final { scan-ipa-dump-times "Creating a specialized node of recur_fn/\[0-9\]*\\." 8 "cp" } } */

+/* { dg-final { scan-ipa-dump-times "Creating a specialized node of 
recur_fn/\[0-9\]*\\." 9 "cp" } } */



Re: [PATCH] rs6000: Use REAL_TYPE to copy when block move array in structure[PR65421]

2020-06-08 Thread luoxhu via Gcc-patches
Hi,

On 2020/6/3 04:32, Segher Boessenkool wrote:
> Hi Xiong Hu,
> 
> On Tue, Jun 02, 2020 at 04:41:50AM -0500, Xionghu Luo wrote:
>> Double array in structure as function arguments or return value is accessed
>> by BLKmode, they are stored to stack and load from stack with redundant
>> conversion from DF->DI->DF.  This patch checks the homogeneous type and
>> use the actual element type to do block move to by pass the conversions.
> 
>> @@ -2733,6 +2734,7 @@ expand_block_move (rtx operands[], bool might_overlap)
>> rtx loads[MAX_MOVE_REG];
>> rtx stores[MAX_MOVE_REG];
>> int num_reg = 0;
>> +  machine_mode elt_mode = DImode;
>>   
>> /* If this is not a fixed size move, just call memcpy */
>> if (! constp)
>> @@ -2750,6 +2752,17 @@ expand_block_move (rtx operands[], bool might_overlap)
>> if (bytes > rs6000_block_move_inline_limit)
>>   return 0;
>>   
>> +  tree type = TREE_TYPE (MEM_EXPR (orig_dest));
> 
> Declare elt_mode here as well?
> 
>> +  if (TREE_CODE (type) == RECORD_TYPE
>> +  && rs6000_discover_homogeneous_aggregate (TYPE_MODE (type), type, 
>> NULL,
>> +NULL))
>> +{
>> +  tree field_type = TREE_TYPE (first_field (type));
>> +  if (field_type && TREE_CODE (field_type) == ARRAY_TYPE
>> +  && TREE_CODE (TREE_TYPE (field_type)) == REAL_TYPE)
>> +elt_mode = TYPE_MODE (TREE_TYPE (field_type));
>> +}
> 
> Homogeneous aggregates only exist in the ELFv2 ABI, while the problem
> here is the SP float things.  You also noticed (elsewhere) that if the
> struct contains (say) SI, SF, SI, SF, then this does not help.
> 
> Is there some better condition this could use, and maybe an expansion
> that works in more cases as well?
> 
> And, it would be lovely if generic code could expand to something better
> already (not expand to a block move at all, certainly not for something
> as tiny as this).

pr65421 is an array in a structure; the assignment is just struct to struct
and won't be split by SROA into element assignments the way a struct that
contains only scalar fields (as in pr69143 below) is.

pr65421.c.236t.optimized:
foo (const struct A a)
{
  struct A D.2909;

   [local count: 1073741824]:
  D.2909 = a;  // struct to struct.
  return D.2909;
}

pr69143.c.234t.optimized:
blah1 (struct foo1 a)
{
  struct foo1 D.2909;
  float _1;
  float _2;

   [local count: 1073741824]:
  _1 = a.y;
  _2 = a.x;
  D.2909.x = _1;// element to element.
  D.2909.y = _2;// element to element.
  return D.2909;
}

So the expander will choose different paths to expand them; a sketch of the
two source shapes follows...
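
A minimal sketch of the two source shapes, with field names and sizes assumed
for illustration only (the real testcases are pr65421.c and pr69143.c):

/* pr65421-like: a homogeneous double array in a struct; the copy stays a
   single BLKmode block move.  */
struct A { double d[4]; };
struct A foo (const struct A a) { return a; }

/* pr69143-like: scalar float fields; SROA splits the copy into per-element
   assignments.  */
struct foo1 { float x; float y; };
struct foo1 blah1 (struct foo1 a)
{
  struct foo1 r = { a.y, a.x };
  return r;
}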

For pr65421, the arguments and return value are accessed in BLKmode after
gimplification, and since no IPA pass touches it, that never changes between
the gimple passes and expand.

In the expander, the type conversion only happens in expand_assignment of
"D.2909 = a;" (the argument copied to the local variable, stack to stack,
generated by expand_block_move as insns #13~#20 below).  expand_function_start
(insns #2~#9) already loads each element as DF, so the DF->DI conversion in
insns #13~#20 causes the later RTL passes to fail forward propagation in
246r.fwprop1.  So my patch tries to use the actual element type for the array
in the structure here.  If rs6000_discover_homogeneous_aggregate is not
allowed to be used here, how about exposing and calling
rs6000_aggregate_candidate directly?  It is not clear to me why "Homogeneous
aggregates only exist in the ELFv2 ABI", since a double array in a structure
is a common usage?

pr65421.c.238r.expand:
1: NOTE_INSN_DELETED
   11: NOTE_INSN_BASIC_BLOCK 2
2: r121:DF=%1:DF
3: r122:DF=%2:DF
4: r123:DF=%3:DF
5: r124:DF=%4:DF
6: [r112:DI+0x20]=r121:DF
7: [r112:DI+0x28]=r122:DF
8: [r112:DI+0x30]=r123:DF
9: [r112:DI+0x38]=r124:DF
   10: NOTE_INSN_FUNCTION_BEG
   13: r125:DI=[r112:DI+0x20]
   15: r126:DI=[r112:DI+0x28]
   17: r127:DI=[r112:DI+0x30]
   19: r128:DI=[r112:DI+0x38]
   14: [r112:DI]=r125:DI
   16: [r112:DI+0x8]=r126:DI
   18: [r112:DI+0x10]=r127:DI
   20: [r112:DI+0x18]=r128:DI
   21: r129:DF=[r112:DI]
   22: r130:DF=[r112:DI+0x8]
   23: r131:DF=[r112:DI+0x10]
   24: r132:DF=[r112:DI+0x18]
   25: r117:DF=r129:DF
   26: r118:DF=r130:DF
   27: r119:DF=r131:DF
   28: r120:DF=r132:DF
   32: %1:DF=r117:DF
   33: %2:DF=r118:DF
   34: %3:DF=r119:DF
   35: %4:DF=r120:DF
   36: use %1:DF
   37: use %2:DF
   38: use %3:DF
   39: use %4:DF

Bypassing the block move would require a very generic code change: BLKmode is
determined very early in gimple, and removing BLKmode looks like a huge
project across stor-layout.c, function.c, expr.c, etc., and I am not sure
other targets would like it.  AArch64 uses an OImode register to avoid the
BLKmode/stack operations, while x86 expands to two TImode register
assignments and returns the result through a pointer.

Or do you mean some workaround that avoids calling emit_block_move (and thus
falling into expand_block_move in rs6000-string.c) when expand_assignment
handles "D.2909 = a;" as below?
rtx
store_expr (tree exp, rtx target, int call_param_p,
bool nontemporal, bool reverse)
{
...
  else if 

Re: [PATCH v2] Fold (add -1; zero_ext; add +1) operations to zero_ext when not overflow (PR37451, part of PR61837)

2020-05-13 Thread luoxhu via Gcc-patches
On 2020/5/13 02:24, Richard Sandiford wrote:
> luoxhu  writes:
>> +  /* Fold (add -1; zero_ext; add +1) operations to zero_ext. i.e:
>> +
>> + 73: r145:SI=r123:DI#0-0x1
>> + 74: r144:DI=zero_extend (r145:SI)
>> + 75: r143:DI=r144:DI+0x1
>> + ...
>> + 31: r135:CC=cmp (r123:DI,0)
>> + 72: {pc={(r143:DI!=0x1)?L70:pc};r143:DI=r143:DI-0x1;clobber
>> + scratch;clobber scratch;}
> 
> Minor, but it might be worth stubbing out the clobbers, since they're
> not really necessary to understand the comment:
> 
>72: {pc={(r143:DI!=0x1)?L70:pc};r143:DI=r143:DI-0x1;...}
> 
>> +
>> + r123:DI#0-0x1 is param count derived from loop->niter_expr equal to the
>> + loop iterations, if loop iterations expression doesn't overflow, then
>> + (zero_extend (r123:DI#0-1))+1 could be simplified to zero_extend only.
>> +   */
>> +  bool simplify_zext = false;
> 
> I think it'd be easier to follow if this was split out into
> a subroutine, rather than having the simplify_zext variable.
> 
>> +  rtx extop0 = XEXP (count, 0);
>> +  if (GET_CODE (count) == ZERO_EXTEND && GET_CODE (extop0) == PLUS)
> 
> This isn't valid: we can only do XEXP (count, 0) *after* checking
> for a ZERO_EXTEND.  (It'd be good to test the patch with
> --enable-checking=yes,extra,rtl , which hopefully would have
> caught this.)
> 
>> +{
>> +  rtx addop0 = XEXP (extop0, 0);
>> +  rtx addop1 = XEXP (extop0, 1);
>> +
>> +  int nonoverflow = 0;
>> +  unsigned int_mode
>> += GET_MODE_PRECISION (as_a <scalar_int_mode> GET_MODE (addop0));
> 
> Heh.  I wondered at first how on earth this compiled.  It looked like
> there was a missing "(...)" around the GET_MODE.  But of course,
> GET_MODE adds its own parentheses, so it all works out. :-)
> 
> Please add the "(...)" anyway though.  We shouldn't rely on that.
> 
> "int_mode" seems a bit of a confusing name, since it's actually a precision
> in bits rather than a mode.
> 
>> +  unsigned HOST_WIDE_INT int_mode_max
>> += (HOST_WIDE_INT_1U << (int_mode - 1) << 1) - 1;
>> +  if (get_max_loop_iterations (loop, &iterations)
>> +  && wi::ltu_p (iterations, int_mode_max))
> 
> You could use GET_MODE_MASK instead of int_mode_max here.
> 
> For extra safety, it would be good to add a HWI_COMPUTABLE_P test,
> to make sure that using HWIs is valid.
> 
>> +nonoverflow = 1;
>> +
>> +  if (nonoverflow
> 
> Having the nonoverflow variable doesn't seem necessary.  We could
> just fuse the two "if" conditions together.
> 
>> +  && CONST_SCALAR_INT_P (addop1)
>> +  && GET_MODE_PRECISION (mode) == int_mode * 2
> 
> This GET_MODE_PRECISION condition also shouldn't be necessary.
> If we can prove that the subtraction doesn't wrap, we can extend
> to any wider mode, not just to double the width.
> 
>> +  && addop1 == GEN_INT (-1))
> 
> This can just be:
> 
> addop1 == constm1_rtx
> 
> There's then no need for the CONST_SCALAR_INT_P check.
> 
> Thanks,
> Richard
> 

Thanks for all your great comments; I addressed them all in the update below.
"--enable-checking=yes,extra,rtl" did catch the ICE, at the cost of a
compile-time penalty.


This "subtract/extend/add" existed for a long time and still annoying us
(PR37451, part of PR61837) when converting from 32bits to 64bits, as the ctr
register is used as 64bits on powerpc64, Andraw Pinski had a patch but
caused some issue and reverted by Joseph S. Myers(PR37451, PR37782).

Andraw:
http://gcc.gnu.org/ml/gcc-patches/2008-09/msg01070.html
http://gcc.gnu.org/ml/gcc-patches/2008-10/msg01321.html
Joseph:
https://gcc.gnu.org/legacy-ml/gcc-patches/2011-11/msg02405.html

We can still do the simplification from "subtract/zero_ext/add" to "zero_ext"
when the loop iteration count is known to be LT MODE_MAX (i.e. only simplify
when counter+0x1 cannot overflow).
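
A minimal sketch of the loop shape that produces the sequence (essentially
the doloop-2.c test added by the patch): the induction variable is a 32-bit
int, but the doloop counter lives in the 64-bit CTR register, which is what
introduces the subtract/zero_extend/add around the trip count.

void
init (int l, int *a)
{
  /* niter is computed in SImode; the CTR count is DImode.  */
  for (int i = 0; i < l; i++)
    a[i] = i;
}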

Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog

2020-05-14  Xiong Hu Luo  

PR rtl-optimization/37451, part of PR target/61837
* loop-doloop.c (doloop_simplify_count): New function.  Simplify
(add -1; zero_ext; add +1) to zero_ext when not wrapping.
(doloop_modify): Call doloop_simplify_count.

gcc/testsuite/ChangeLog

2020-05-14  Xiong Hu Luo  

PR rtl-optimization/37451, part of PR target/61837
* gcc.target/powerpc/doloop-2.c: New test.
---
 gcc/loop-doloop.c   | 38 -
 gcc/testsuite/gcc.target/powerpc/doloop-2.c | 29 
 2 files changed, 66 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/doloop-2.c

diff --git a/gcc/loop-doloop.c b/gcc/loop-doloop.c
index db6a014e43d..02282d45bd5 100644
--- a/gcc/loop-doloop.c
+++ b/gcc/loop-doloop.c
@@ -397,6 +397,42 @@ add_test (rtx cond, edge *e, basic_block dest)
   return true;
 }
 
+/* Fold (add -1; zero_ext; add +1) operations to zero_ext if not wrapping. i.e:
+
+   73: r145:SI=r123:DI#0-0x1
+   74: r144:DI=zero_extend (r145:SI)
+   75: 

[PATCH v2] Fold (add -1; zero_ext; add +1) operations to zero_ext when not overflow (PR37451, part of PR61837)

2020-05-12 Thread luoxhu via Gcc-patches
Minor refinement of the iteration non-overflow check, plus a testcase, for stage 1.


This "subtract/extend/add" existed for a long time and still annoying us
(PR37451, part of PR61837) when converting from 32bits to 64bits, as the ctr
register is used as 64bits on powerpc64, Andraw Pinski had a patch but
caused some issue and reverted by Joseph S. Myers(PR37451, PR37782).

Andraw:
http://gcc.gnu.org/ml/gcc-patches/2008-09/msg01070.html
http://gcc.gnu.org/ml/gcc-patches/2008-10/msg01321.html
Joseph:
https://gcc.gnu.org/legacy-ml/gcc-patches/2011-11/msg02405.html

We can do the simplification from "subtract/extend/add" to only the extend
when the loop iteration count is known to be LT MODE_MAX-1 (do NOT simplify
when counter+0x1 can overflow).

Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog

2020-05-12  Xiong Hu Luo  

PR rtl-optimization/37451, part of PR target/61837
* loop-doloop.c (doloop_modify): Simplify (add -1; zero_ext; add +1)
to zero_ext when not wrapping overflow.

gcc/testsuite/ChangeLog

2020-05-12  Xiong Hu Luo  

PR rtl-optimization/37451, part of PR target/61837
* gcc.target/powerpc/doloop-2.c: New test.
---
 gcc/loop-doloop.c   | 46 -
 gcc/testsuite/gcc.target/powerpc/doloop-2.c | 14 +++
 2 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/doloop-2.c

diff --git a/gcc/loop-doloop.c b/gcc/loop-doloop.c
index db6a014e43d..16372382a22 100644
--- a/gcc/loop-doloop.c
+++ b/gcc/loop-doloop.c
@@ -477,7 +477,51 @@ doloop_modify (class loop *loop, class niter_desc *desc,
 }
 
   if (increment_count)
-count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
+{
+  /* Fold (add -1; zero_ext; add +1) operations to zero_ext. i.e:
+
+73: r145:SI=r123:DI#0-0x1
+74: r144:DI=zero_extend (r145:SI)
+75: r143:DI=r144:DI+0x1
+...
+31: r135:CC=cmp (r123:DI,0)
+72: {pc={(r143:DI!=0x1)?L70:pc};r143:DI=r143:DI-0x1;clobber
+scratch;clobber scratch;}
+
+r123:DI#0-0x1 is param count derived from loop->niter_expr equal to the
+loop iterations, if loop iterations expression doesn't overflow, then
+(zero_extend (r123:DI#0-1))+1 could be simplified to zero_extend only.
+   */
+  bool simplify_zext = false;
+  rtx extop0 = XEXP (count, 0);
+  if (GET_CODE (count) == ZERO_EXTEND && GET_CODE (extop0) == PLUS)
+   {
+ rtx addop0 = XEXP (extop0, 0);
+ rtx addop1 = XEXP (extop0, 1);
+
+ int nonoverflow = 0;
+ unsigned int_mode
+   = GET_MODE_PRECISION (as_a <scalar_int_mode> GET_MODE (addop0));
+ unsigned HOST_WIDE_INT int_mode_max
+   = (HOST_WIDE_INT_1U << (int_mode - 1) << 1) - 1;
+ if (get_max_loop_iterations (loop, &iterations)
+ && wi::ltu_p (iterations, int_mode_max))
+   nonoverflow = 1;
+
+ if (nonoverflow
+ && CONST_SCALAR_INT_P (addop1)
+ && GET_MODE_PRECISION (mode) == int_mode * 2
+ && addop1 == GEN_INT (-1))
+   {
+ count = simplify_gen_unary (ZERO_EXTEND, mode, addop0,
+ GET_MODE (addop0));
+ simplify_zext = true;
+   }
+   }
+
+  if (!simplify_zext)
+   count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
+}
 
   /* Insert initialization of the count register into the loop header.  */
   start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/doloop-2.c 
b/gcc/testsuite/gcc.target/powerpc/doloop-2.c
new file mode 100644
index 000..dc8516bb0ab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/doloop-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target powerpc*-*-* } } */
+/* { dg-options "-O2 -fno-unroll-loops" } */
+
+int f(int l, int *a)
+{
+int i;
+  for(i = 0;i < l; i++)
+   a[i] = i;
+return l;
+}
+
+/* { dg-final { scan-assembler-not "-1" } } */
+/* { dg-final { scan-assembler "bdnz" } } */
+/* { dg-final { scan-assembler-times "mtctr" 1 } } */
-- 
2.21.0.777.g83232e3864





Re: [PATCH v2] Add handling of MULT_EXPR/PLUS_EXPR for wrapping overflow in affine combination(PR83403)

2020-05-11 Thread luoxhu via Gcc-patches

On 2020-05-06 20:09, Richard Biener wrote:

On Thu, 30 Apr 2020, luoxhu wrote:

Update the patch with overflow check.  Bootstrap and regression tested 
PASS on Power8-LE.



Use determine_value_range to get value range info for fold-convert expressions
whose inner operation is PLUS_EXPR/MINUS_EXPR/MULT_EXPR and does not overflow
in the wrapping-overflow inner type, i.e.:
wrapping overflow inner type.  i.e.:

(long unsigned int)((unsigned  int)n * 10 + 1)
=>
(long unsigned int)n * (long unsigned int)10 + (long unsigned int)1

With this patch for affine combination, load/store motion could detect
more address refs independency and promote some memory expressions to
registers within loop.

PS: Replace the previous "(T1)(X + CST) as (T1)X - (T1)(-CST))"
to "(T1)(X + CST) as (T1)X + (T1)(CST))" for wrapping overflow.


This is OK for trunk if bootstrapped / tested properly.



Bootstrap and regression tests pass on Power8-LE; committed as
r11-259-g0447929f11e6a3e1b076841712b90a8b6bc7d33a.  Is it necessary
to backport it to gcc-10?


Thanks,
Xionghu




Thanks,
Richard.


gcc/ChangeLog

2020-04-30  Xiong Hu Luo  

PR tree-optimization/83403
* tree-affine.c (expr_to_aff_combination): Replace SSA_NAME with
determine_value_range, Add fold conversion of MULT_EXPR, fix the
previous PLUS_EXPR.

gcc/testsuite/ChangeLog

2020-04-30  Xiong Hu Luo  

PR tree-optimization/83403
* gcc.dg/tree-ssa/pr83403-1.c: New test.
* gcc.dg/tree-ssa/pr83403-2.c: New test.
* gcc.dg/tree-ssa/pr83403.h: New header.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c |  8 ++
 gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c |  8 ++
 gcc/testsuite/gcc.dg/tree-ssa/pr83403.h   | 30 
+++

 gcc/tree-affine.c | 24 ++
 4 files changed, 60 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403.h

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c

new file mode 100644
index 000..748375b03af
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
+
+#define TYPE unsigned int
+
+#include "pr83403.h"
+
+/* { dg-final { scan-tree-dump-times "Executing store motion of" 10 
"lim2" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c

new file mode 100644
index 000..ca2e6bbd61c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
+
+#define TYPE int
+
+#include "pr83403.h"
+
+/* { dg-final { scan-tree-dump-times "Executing store motion of" 10 
"lim2" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h

new file mode 100644
index 000..0da8a835b5f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h
@@ -0,0 +1,30 @@
+__attribute__ ((noinline)) void
+calculate (const double *__restrict__ A, const double *__restrict__ 
B,

+  double *__restrict__ C)
+{
+  TYPE m = 0;
+  TYPE n = 0;
+  TYPE k = 0;
+
+  A = (const double *) __builtin_assume_aligned (A, 16);
+  B = (const double *) __builtin_assume_aligned (B, 16);
+  C = (double *) __builtin_assume_aligned (C, 16);
+
+  for (n = 0; n < 9; n++)
+{
+  for (m = 0; m < 10; m++)
+   {
+ C[(n * 10) + m] = 0.0;
+   }
+
+  for (k = 0; k < 17; k++)
+   {
+#pragma simd
+ for (m = 0; m < 10; m++)
+   {
+ C[(n * 10) + m] += A[(k * 20) + m] * B[(n * 20) + k];
+   }
+   }
+}
+}
+
diff --git a/gcc/tree-affine.c b/gcc/tree-affine.c
index 0eb8db1b086..5620e6bf28f 100644
--- a/gcc/tree-affine.c
+++ b/gcc/tree-affine.c
@@ -343,24 +343,28 @@ expr_to_aff_combination (aff_tree *comb, 
tree_code code, tree type,

wide_int minv, maxv;
/* If inner type has wrapping overflow behavior, fold conversion
   for below case:
-(T1)(X - CST) -> (T1)X - (T1)CST
-	   if X - CST doesn't overflow by range information.  Also 
handle

-  (T1)(X + CST) as (T1)(X - (-CST)).  */
+(T1)(X *+- CST) -> (T1)X *+- (T1)CST
+  if X *+- CST doesn't overflow by range information.  */
if (TYPE_UNSIGNED (itype)
&& TYPE_OVERFLOW_WRAPS (itype)
-   && TREE_CODE (op0) == SSA_NAME
&& TREE_CODE (op1) == INTEGER_CST
-   && icode != MULT_EXPR
-   && get_range_info (op0, &minv, &maxv) == VR_RANGE)
+   && determine_value_range (op0, &minv, &maxv) == VR_RANGE)
  {
+   wi::overflow_type overflow = wi::OVF_NONE;
+   signop sign = UNSIGNED;

[PATCH v2] Add handling of MULT_EXPR/PLUS_EXPR for wrapping overflow in affine combination(PR83403)

2020-04-30 Thread luoxhu via Gcc-patches
Update the patch with overflow check.  Bootstrap and regression tested PASS on 
Power8-LE.


Use determine_value_range to get value range info for fold-convert expressions
whose inner operation is PLUS_EXPR/MINUS_EXPR/MULT_EXPR and does not overflow
in the wrapping-overflow inner type, i.e.:

(long unsigned int)((unsigned  int)n * 10 + 1)
=>
(long unsigned int)n * (long unsigned int)10 + (long unsigned int)1

With this patch for affine combination, load/store motion could detect
more address refs independency and promote some memory expressions to
registers within loop.

PS: Replace the previous "(T1)(X + CST) as (T1)X - (T1)(-CST))"
to "(T1)(X + CST) as (T1)X + (T1)(CST))" for wrapping overflow.

gcc/ChangeLog

2020-04-30  Xiong Hu Luo  

PR tree-optimization/83403
* tree-affine.c (expr_to_aff_combination): Replace SSA_NAME with
determine_value_range, Add fold conversion of MULT_EXPR, fix the
previous PLUS_EXPR.

gcc/testsuite/ChangeLog

2020-04-30  Xiong Hu Luo  

PR tree-optimization/83403
* gcc.dg/tree-ssa/pr83403-1.c: New test.
* gcc.dg/tree-ssa/pr83403-2.c: New test.
* gcc.dg/tree-ssa/pr83403.h: New header.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c |  8 ++
 gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c |  8 ++
 gcc/testsuite/gcc.dg/tree-ssa/pr83403.h   | 30 +++
 gcc/tree-affine.c | 24 ++
 4 files changed, 60 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403.h

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
new file mode 100644
index 000..748375b03af
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
+
+#define TYPE unsigned int
+
+#include "pr83403.h"
+
+/* { dg-final { scan-tree-dump-times "Executing store motion of" 10 "lim2" } } 
*/
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
new file mode 100644
index 000..ca2e6bbd61c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
+
+#define TYPE int
+
+#include "pr83403.h"
+
+/* { dg-final { scan-tree-dump-times "Executing store motion of" 10 "lim2" } } 
*/
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h
new file mode 100644
index 000..0da8a835b5f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h
@@ -0,0 +1,30 @@
+__attribute__ ((noinline)) void
+calculate (const double *__restrict__ A, const double *__restrict__ B,
+  double *__restrict__ C)
+{
+  TYPE m = 0;
+  TYPE n = 0;
+  TYPE k = 0;
+
+  A = (const double *) __builtin_assume_aligned (A, 16);
+  B = (const double *) __builtin_assume_aligned (B, 16);
+  C = (double *) __builtin_assume_aligned (C, 16);
+
+  for (n = 0; n < 9; n++)
+{
+  for (m = 0; m < 10; m++)
+   {
+ C[(n * 10) + m] = 0.0;
+   }
+
+  for (k = 0; k < 17; k++)
+   {
+#pragma simd
+ for (m = 0; m < 10; m++)
+   {
+ C[(n * 10) + m] += A[(k * 20) + m] * B[(n * 20) + k];
+   }
+   }
+}
+}
+
diff --git a/gcc/tree-affine.c b/gcc/tree-affine.c
index 0eb8db1b086..5620e6bf28f 100644
--- a/gcc/tree-affine.c
+++ b/gcc/tree-affine.c
@@ -343,24 +343,28 @@ expr_to_aff_combination (aff_tree *comb, tree_code code, 
tree type,
wide_int minv, maxv;
/* If inner type has wrapping overflow behavior, fold conversion
   for below case:
-(T1)(X - CST) -> (T1)X - (T1)CST
-  if X - CST doesn't overflow by range information.  Also handle
-  (T1)(X + CST) as (T1)(X - (-CST)).  */
+(T1)(X *+- CST) -> (T1)X *+- (T1)CST
+  if X *+- CST doesn't overflow by range information.  */
if (TYPE_UNSIGNED (itype)
&& TYPE_OVERFLOW_WRAPS (itype)
-   && TREE_CODE (op0) == SSA_NAME
&& TREE_CODE (op1) == INTEGER_CST
-   && icode != MULT_EXPR
-   && get_range_info (op0, &minv, &maxv) == VR_RANGE)
+   && determine_value_range (op0, &minv, &maxv) == VR_RANGE)
  {
+   wi::overflow_type overflow = wi::OVF_NONE;
+   signop sign = UNSIGNED;
if (icode == PLUS_EXPR)
- op1 = wide_int_to_tree (itype, -wi::to_wide (op1));
-   if (wi::geu_p (minv, wi::to_wide (op1)))
+ wi::add (maxv, wi::to_wide (op1), sign, &overflow);
+   else if (icode == MULT_EXPR)
+ wi::mul (maxv, wi::to_wide (op1), sign, &overflow);
+ 

Re: [PATCH] Add value range info for affine combination to improve store motion (PR83403)

2020-04-29 Thread luoxhu via Gcc-patches



On 2020/4/28 18:30, Richard Biener wrote:

> 
> OK, I guess instead of get_range_info expr_to_aff_combination could
> simply use determine_value_range (op0, &minv, &maxv) == VR_RANGE
> (the && TREE_CODE (op0) == SSA_NAME check can then be removed)?
> 

I tried determine_value_range; it works and is much more straightforward.
Thanks for your suggestion.  The patch is updated as below with some tests;
regression testing is in progress.


[PATCH] Add handling of MULT_EXPR/PLUS_EXPR for wrapping overflow in affine
combination (PR83403)

Use determine_value_range to get value range info for fold-convert expressions
whose inner operation is PLUS_EXPR/MINUS_EXPR/MULT_EXPR and does not overflow
in the wrapping-overflow inner type, i.e.:

(long unsigned int)((unsigned  int)n * 10 + 1)
=>
(long unsigned  int)n * (long unsigned int)10 + (long unsigned int)1

With this patch for affine combination, load/store motion could detect
more address refs independency and promote some memory expressions to
registers within loop.

PS: Replace the previous "(T1)(X + CST) as (T1)X - (T1)(-CST))"
to "(T1)(X + CST) as (T1)X + (T1)(CST))" for wrapping overflow.

gcc/ChangeLog

2020-04-29  Xiong Hu Luo  

PR tree-optimization/83403
* tree-affine.c (expr_to_aff_combination): Replace SSA_NAME with
determine_value_range, Add fold conversion of MULT_EXPR, fix the
previous PLUS_EXPR.

gcc/testsuite/ChangeLog

2020-04-29  Xiong Hu Luo  

PR tree-optimization/83403
* gcc.dg/tree-ssa/pr83403-1.c: New test.
* gcc.dg/tree-ssa/pr83403-2.c: New test.
* gcc.dg/tree-ssa/pr83403.h: New header.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c |  8 ++
 gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c |  8 ++
 gcc/testsuite/gcc.dg/tree-ssa/pr83403.h   | 30 +++
 gcc/tree-affine.c | 22 +
 4 files changed, 58 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr83403.h

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
new file mode 100644
index 000..748375b03af
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
+
+#define TYPE unsigned int
+
+#include "pr83403.h"
+
+/* { dg-final { scan-tree-dump-times "Executing store motion of" 10 "lim2" } } 
*/
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
new file mode 100644
index 000..ca2e6bbd61c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
+
+#define TYPE int
+
+#include "pr83403.h"
+
+/* { dg-final { scan-tree-dump-times "Executing store motion of" 10 "lim2" } } 
*/
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h 
b/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h
new file mode 100644
index 000..0da8a835b5f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403.h
@@ -0,0 +1,30 @@
+__attribute__ ((noinline)) void
+calculate (const double *__restrict__ A, const double *__restrict__ B,
+  double *__restrict__ C)
+{
+  TYPE m = 0;
+  TYPE n = 0;
+  TYPE k = 0;
+
+  A = (const double *) __builtin_assume_aligned (A, 16);
+  B = (const double *) __builtin_assume_aligned (B, 16);
+  C = (double *) __builtin_assume_aligned (C, 16);
+
+  for (n = 0; n < 9; n++)
+{
+  for (m = 0; m < 10; m++)
+   {
+ C[(n * 10) + m] = 0.0;
+   }
+
+  for (k = 0; k < 17; k++)
+   {
+#pragma simd
+ for (m = 0; m < 10; m++)
+   {
+ C[(n * 10) + m] += A[(k * 20) + m] * B[(n * 20) + k];
+   }
+   }
+}
+}
+
diff --git a/gcc/tree-affine.c b/gcc/tree-affine.c
index 0eb8db1b086..86bb64fe245 100644
--- a/gcc/tree-affine.c
+++ b/gcc/tree-affine.c
@@ -343,23 +343,25 @@ expr_to_aff_combination (aff_tree *comb, tree_code code, 
tree type,
wide_int minv, maxv;
/* If inner type has wrapping overflow behavior, fold conversion
   for below case:
-(T1)(X - CST) -> (T1)X - (T1)CST
-  if X - CST doesn't overflow by range information.  Also handle
-  (T1)(X + CST) as (T1)(X - (-CST)).  */
+(T1)(X *+- CST) -> (T1)X *+- (T1)CST
+  if X *+- CST doesn't overflow by range information.  */
if (TYPE_UNSIGNED (itype)
&& TYPE_OVERFLOW_WRAPS (itype)
-   && TREE_CODE (op0) == SSA_NAME
&& TREE_CODE (op1) == INTEGER_CST
-   && icode != MULT_EXPR
-   && get_range_info (op0, &minv, &maxv) == VR_RANGE)
+   && determine_value_range (op0, &minv, &maxv) == VR_RANGE)
  {
-   if 

Re: [PATCH] Add value range info for affine combination to improve store motion (PR83403)

2020-04-28 Thread luoxhu via Gcc-patches



On 2020/4/28 15:01, Richard Biener wrote:
> On Tue, 28 Apr 2020, Xionghu Luo wrote:
> 
>> From: Xionghu Luo 
>>
>> Get and propagate value range info to convert expressions with convert
>> operation on PLUS_EXPR/MINUS_EXPR/MULT_EXPR when not overflow.  i.e.:
>>
>> (long unsigned int)((unsigned  int)n * 10 + 1)
>> =>
>> (long unsigned int)((unsigned  int) n * (long unsigned int)10 + (long 
>> unsigned int)1)
>>
>> With this patch for affine combination, load/store motion could detect
>> more address refs independency and promote some memory expressions to
>> registers within loop.
>>
>> PS: Replace the previous "(T1)(X + CST) as (T1)X - (T1)(-CST))"
>> to "(T1)(X + CST) as (T1)X + (T1)(CST))" for wrapping overflow.
>>
>> Bootstrap and regression tested pass on Power8-LE.  Any comments?
>> Thanks.
> 
> So the core of the patch is adding handling of MULT_EXPR and the rest
> is a no-op?  It's not clear from the patch what passing down a
> value-range does and your description doesn't say anything about this
> either.

This patch handles MULT_EXPR first, and it also handles
(long unsigned int)(n_93 * 10 + 1) even though "n_93 * 10" is not an SSA_NAME,
by using the value_range passed down: [minv, maxv] = [1, 81] is the range info
of "n_93 * 10 + 1", so the wi::leu_p (maxv, wi::to_wide (TYPE_MAX_VALUE (itype)))
check in expr_to_aff_combination can ensure that this conversion does not wrap.
Not sure if it's better described now...  Some debug output follows:

131t.lim2:
# n_93 = PHI <0(2), n_36(7)>
_118 = n_93 * 10;
_120 = (long unsigned int) _118;
_121 = _120 * 8;
_122 = [0][0] + _121;
*_122 = 0.0;
_128 = _118 + 1;
_129 = (long unsigned int) _128;
_130 = _129 * 8;
_131 = [0][0] + _130;
*_131 = 0.0;
_137 = _118 + 2;
_138 = (long unsigned int) _137;
_139 = _138 * 8;
_140 = [0][0] + _139;
*_140 = 0.0;


Breakpoint 28, expr_to_aff_combination (comb=0x7fffc068, code=NOP_EXPR, 
type=, op0=, op1=, 
vr=0x7fffc490) at ../../gcc-mast
er/gcc/tree-affine.c:350
92: itype = 
93: otype = 
94: icode = PLUS_EXPR
95: code = NOP_EXPR
(gdb) ptree op0
 
unit-size 
align:32 warn_if_not_align:0 symtab:0 alias-set -1 canonical-type 
0x753c0690 precision:32 min <
integer_cst 0x752f1230 0> max  
context 
pointer_to_this >

arg:0 
visited var 
def_stmt n_93 = PHI <0(2), n_36(7)>
version:93
ptr-info 0x7530b680>
arg:1  constant 10>>
(gdb) ptree op1
  
constant 1>
(gdb) p minv
$580 = {
   = {
val = {1, 0, 140737310886240},
len = 1,
precision = 32
  },
  members of generic_wide_int:
  static is_sign_extended = true
}
(gdb) p maxv
$581 = {
   = {
val = {81, 65, 40},
len = 1,
precision = 32
  },
  members of generic_wide_int:
  static is_sign_extended = true
}


Thanks,
Xionghu

> 
> Richard.
> 
>> gcc/ChangeLog
>>
>>  2020-04-28  Xiong Hu Luo  
>>
>>  PR tree-optimization/83403
>>  * tree-affine.c (aff_combination_convert): New parameter
>>  value_range.
>>  (expr_to_aff_combination): Use range info to check CONVERT
>>  expr on PLUS_EXPR/MINUS_EXPR/MULT_EXPR.
>>  (tree_to_aff_combination): Likewise.
>>  (aff_combination_expand): Get and propagate range info.
>>  * tree-affine.h: Include value-range.h.
>>  (aff_combination_convert): New parameter value_range.
>>  (tree_to_aff_combination): Likewise.
>>  (aff_combination_expand): Likewise.
>> ---
>>   gcc/tree-affine.c | 66 +++
>>   gcc/tree-affine.h |  8 +++---
>>   2 files changed, 43 insertions(+), 31 deletions(-)
>>
>> diff --git a/gcc/tree-affine.c b/gcc/tree-affine.c
>> index 0eb8db1b086..63f8acd4c73 100644
>> --- a/gcc/tree-affine.c
>> +++ b/gcc/tree-affine.c
>> @@ -220,7 +220,7 @@ aff_combination_add (aff_tree *comb1, aff_tree *comb2)
>>   /* Converts affine combination COMB to TYPE.  */
>>   
>>   void
>> -aff_combination_convert (aff_tree *comb, tree type)
>> +aff_combination_convert (aff_tree *comb, tree type, value_range *vr)
>>   {
>> unsigned i, j;
>> tree comb_type = comb->type;
>> @@ -228,7 +228,7 @@ aff_combination_convert (aff_tree *comb, tree type)
>> if  (TYPE_PRECISION (type) > TYPE_PRECISION (comb_type))
>>   {
>> tree val = fold_convert (type, aff_combination_to_tree (comb));
>> -  tree_to_aff_combination (val, type, comb);
>> +  tree_to_aff_combination (val, type, comb, vr);
>> return;
>>   }
>>   
>> @@ -263,8 +263,8 @@ aff_combination_convert (aff_tree *comb, tree type)
>>  true when that was successful and returns the combination in COMB.  */
>>   
>>   static bool
>> -expr_to_aff_combination (aff_tree *comb, tree_code code, tree type,
>> - tree op0, tree op1 = NULL_TREE)
>> +expr_to_aff_combination (aff_tree *comb, tree_code code, tree type, tree 
>> op0,
>> + tree op1 = NULL_TREE, value_range *vr = NULL)
>>   {
>> aff_tree tmp;
>> poly_int64 bitpos, bitsize, bytepos;
>> @@ -279,8 +279,8 

Re: [PATCH] Fold (add -1; zero_ext; add +1) operations to zero_ext when not zero (PR37451, PR61837)

2020-04-20 Thread luoxhu via Gcc-patches

Tiny update to accommodate unsigned int compare.

On 2020/4/20 16:21, luoxhu via Gcc-patches wrote:

Hi,

On 2020/4/18 00:32, Segher Boessenkool wrote:

On Thu, Apr 16, 2020 at 08:21:40PM -0500, Segher Boessenkool wrote:

On Wed, Apr 15, 2020 at 10:18:16AM +0100, Richard Sandiford wrote:

luoxhu--- via Gcc-patches  writes:

-count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
+{
+  /* Fold (add -1; zero_ext; add +1) operations to zero_ext based on addop0
+is never zero, as gimple pass loop ch will do optimization to simplify
+the loop to NO loop for loop condition is false.  */


IMO the code needs to prove this, rather than just assume that previous
passes have made it so.


Well, it should gcc_assert it, probably.

It is the left-hand side of a+b...  it cannot be 0, because niter always
is simplified!


Scratch that...  it cannot be *constant* 0, but that isn't the issue here.


Sorry that my comments in the code are a bit misleading; this is actually not
related to loop-ch at all.  The instruction sequence at 255r.loop2_invariant:

25: NOTE_INSN_BASIC_BLOCK 5
26: r133:SI=r123:DI#0-0x1
   REG_DEAD r123:DI
27: r123:DI=zero_extend(r133:SI)
   REG_DEAD r133:SI
28: r124:DI=r124:DI+0x4
30: r134:CC=cmp(r123:DI,0)
31: pc={(r134:CC!=0)?L69:pc}

And 257r.loop2_doloop (inserted #72,#73,#74, and #31 is replaced by #71):

;; Determined upper bound -1.
Loop 2 is simple:
   simple exit 6 -> 7
   number of iterations: (plus:SI (subreg:SI (reg:DI 123 [ doloop.6 ]) 0)
 (const_int -1 [0x]))
   upper bound: 2147483646
   likely upper bound: 2147483646
   realistic bound: -1
...
72: r144:SI=r123:DI#0-0x1
73: r143:DI=zero_extend(r144:SI)
74: r142:DI=r143:DI+0x1
...
25: NOTE_INSN_BASIC_BLOCK 5
26: r133:SI=r123:DI#0-0x1
   REG_DEAD r123:DI
27: r123:DI=zero_extend(r133:SI)
   REG_DEAD r133:SI
28: r124:DI=r124:DI+0x4
30: r134:CC=cmp(r123:DI,0)
71: {pc={(r142:DI!=0x1)?L69:pc};r142:DI=r142:DI-0x1;clobber scratch;clobber 
scratch;}

increment_count being true ensures the condition is NE const1_rtx;
r123:DI#0-0x1 is the loop iteration count in doloop, which is definitely >= 0,
and r123:DI#0 also shouldn't be zero since the loop upper bound is
2147483646 (0x7ffffffe)???

Since this simplification is in doloop_modify, there are already doloop form
checks like !desc->simple_p || desc->assumptions || desc->infinite in
doloop_valid_p, so it seems unnecessary to repeat those checks here?
Maybe we just need to check that the loop upper bound is LEU than 0x7ffffffe
to avoid overflow in instruction #26?

0xfffffffe




Updated patch, thanks:


This "subtract/extend/add" existed for a long time and still annoying us
(PR37451, PR61837) when converting from 32bits to 64bits, as the ctr
register is used as 64bits on powerpc64, Andraw Pinski had a patch but
caused some issue and reverted by Joseph S. Myers(PR37451, PR37782).

Andraw:
http://gcc.gnu.org/ml/gcc-patches/2008-09/msg01070.html
http://gcc.gnu.org/ml/gcc-patches/2008-10/msg01321.html
Joseph:
https://gcc.gnu.org/legacy-ml/gcc-patches/2011-11/msg02405.html

However, the doloop code has improved a lot over the years:
gcc.c-torture/execute/doloop-1.c is no longer a simple loop with a constant
desc->niter_expr since r125:SI#0 is not SImode, so it is not a valid doloop
and no transform is done in doloop anymore.  Thus we can do the simplification
from "subtract/extend/add" to only the extend when the loop upper bound is
known to be LE than SINT_MAX-1 (do not simplify when r120:DI#0-0x1 overflows).

UINT_MAX-1



Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog

2020-04-20  Xiong Hu Luo  

PR rtl-optimization/37451, PR target/61837
* loop-doloop.c (doloop_modify): Simplify (add -1; zero_ext; add +1)
to zero_ext.
---
  gcc/loop-doloop.c | 41 -
  1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/gcc/loop-doloop.c b/gcc/loop-doloop.c
index db6a014e43d..da537aff60f 100644
--- a/gcc/loop-doloop.c
+++ b/gcc/loop-doloop.c
@@ -477,7 +477,46 @@ doloop_modify (class loop *loop, class niter_desc *desc,
  }

if (increment_count)
-count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
+{
+  /* Fold (add -1; zero_ext; add +1) operations to zero_ext. i.e:
+
+73: r145:SI=r123:DI#0-0x1
+74: r144:DI=zero_extend(r145:SI)
+75: r143:DI=r144:DI+0x1
+...
+31: r135:CC=cmp(r123:DI,0)
+72: {pc={(r143:DI!=0x1)?L70:pc};r143:DI=r143:DI-0x1;clobber
+scratch;clobber scratch;}
+
+r123:DI#0-0x1 is the loop iterations be GE than 0, r143 is the loop
+count be saved to ctr, if this loop's upper bound is known, r123:DI#0
+won't be zero, then the expressions could be simplified to zero_extend
+only.  */
+  bool simplify_zext = false;
+  rtx ext

Re: [PATCH] Fold (add -1; zero_ext; add +1) operations to zero_ext when not zero (PR37451, PR61837)

2020-04-20 Thread luoxhu via Gcc-patches
Hi,

On 2020/4/18 00:32, Segher Boessenkool wrote:
> On Thu, Apr 16, 2020 at 08:21:40PM -0500, Segher Boessenkool wrote:
>> On Wed, Apr 15, 2020 at 10:18:16AM +0100, Richard Sandiford wrote:
>>> luoxhu--- via Gcc-patches  writes:
>>>> -count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
>>>> +{
>>>> +  /* Fold (add -1; zero_ext; add +1) operations to zero_ext based on 
>>>> addop0
>>>> +   is never zero, as gimple pass loop ch will do optimization to simplify
>>>> +   the loop to NO loop for loop condition is false.  */
>>>
>>> IMO the code needs to prove this, rather than just assume that previous
>>> passes have made it so.
>>
>> Well, it should gcc_assert it, probably.
>>
>> It is the left-hand side of a+b...  it cannot be 0, because niter always
>> is simplified!
> 
> Scratch that...  it cannot be *constant* 0, but that isn't the issue here.

Sorry that my comments in the code are a bit misleading; this is actually not
related to loop-ch at all.  The instruction sequence at 255r.loop2_invariant:

   25: NOTE_INSN_BASIC_BLOCK 5
   26: r133:SI=r123:DI#0-0x1
  REG_DEAD r123:DI
   27: r123:DI=zero_extend(r133:SI)
  REG_DEAD r133:SI
   28: r124:DI=r124:DI+0x4
   30: r134:CC=cmp(r123:DI,0)
   31: pc={(r134:CC!=0)?L69:pc}

And 257r.loop2_doloop (inserted #72,#73,#74, and #31 is replaced by #71):   

;; Determined upper bound -1.
Loop 2 is simple:
  simple exit 6 -> 7
  number of iterations: (plus:SI (subreg:SI (reg:DI 123 [ doloop.6 ]) 0)
(const_int -1 [0x]))
  upper bound: 2147483646
  likely upper bound: 2147483646
  realistic bound: -1
...
   72: r144:SI=r123:DI#0-0x1
   73: r143:DI=zero_extend(r144:SI)
   74: r142:DI=r143:DI+0x1
...
   25: NOTE_INSN_BASIC_BLOCK 5
   26: r133:SI=r123:DI#0-0x1
  REG_DEAD r123:DI
   27: r123:DI=zero_extend(r133:SI)
  REG_DEAD r133:SI
   28: r124:DI=r124:DI+0x4
   30: r134:CC=cmp(r123:DI,0)
   71: {pc={(r142:DI!=0x1)?L69:pc};r142:DI=r142:DI-0x1;clobber scratch;clobber 
scratch;}

increment_count being true ensures the condition is NE const1_rtx;
r123:DI#0-0x1 is the loop iteration count in doloop, which is definitely >= 0,
and r123:DI#0 also shouldn't be zero since the loop upper bound is
2147483646 (0x7ffffffe)???

Since this simplification is in doloop_modify, there are already doloop form
checks like !desc->simple_p || desc->assumptions || desc->infinite in
doloop_valid_p, so it seems unnecessary to repeat those checks here?
Maybe we just need to check that the loop upper bound is LEU than 0x7ffffffe
to avoid overflow in instruction #26?


Updated patch, thanks:


This "subtract/extend/add" existed for a long time and still annoying us
(PR37451, PR61837) when converting from 32bits to 64bits, as the ctr
register is used as 64bits on powerpc64, Andraw Pinski had a patch but
caused some issue and reverted by Joseph S. Myers(PR37451, PR37782).

Andraw:
http://gcc.gnu.org/ml/gcc-patches/2008-09/msg01070.html
http://gcc.gnu.org/ml/gcc-patches/2008-10/msg01321.html
Joseph:
https://gcc.gnu.org/legacy-ml/gcc-patches/2011-11/msg02405.html

However, the doloop code has improved a lot over the years:
gcc.c-torture/execute/doloop-1.c is no longer a simple loop with a constant
desc->niter_expr since r125:SI#0 is not SImode, so it is not a valid doloop
and no transform is done in doloop anymore.  Thus we can do the simplification
from "subtract/extend/add" to only the extend when the loop upper bound is
known to be LE than SINT_MAX-1 (do not simplify when r120:DI#0-0x1 overflows).

Bootstrap and regression tested pass on Power8-LE.

gcc/ChangeLog

2020-04-20  Xiong Hu Luo  

PR rtl-optimization/37451, PR target/61837
* loop-doloop.c (doloop_modify): Simplify (add -1; zero_ext; add +1)
to zero_ext.
---
 gcc/loop-doloop.c | 41 -
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/gcc/loop-doloop.c b/gcc/loop-doloop.c
index db6a014e43d..da537aff60f 100644
--- a/gcc/loop-doloop.c
+++ b/gcc/loop-doloop.c
@@ -477,7 +477,46 @@ doloop_modify (class loop *loop, class niter_desc *desc,
 }
 
   if (increment_count)
-count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
+{
+  /* Fold (add -1; zero_ext; add +1) operations to zero_ext. i.e:
+
+73: r145:SI=r123:DI#0-0x1
+74: r144:DI=zero_extend(r145:SI)
+75: r143:DI=r144:DI+0x1
+...
+31: r135:CC=cmp(r123:DI,0)
+72: {pc={(r143:DI!=0x1)?L70:pc};r143:DI=r143:DI-0x1;clobber
+scratch;clobber scratch;}
+
+r123:DI#0-0x1 is the loop iterations be GE than 0, r143 is the loop
+count be saved to ctr, if this loop's upper bound is known, r123:DI#0
+won't be zero, then the expressions could be simplified to zero_extend
+only.  */
+   

Re: [PATCH v2] rs6000: Don't use HARD_FRAME_POINTER_REGNUM if it's not live in pro_and_epilogue (PR91518)

2020-04-16 Thread luoxhu via Gcc-patches



On 2020/4/17 08:52, Segher Boessenkool wrote:
> Hi!
> 
> On Mon, Apr 13, 2020 at 10:11:43AM +0800, luoxhu wrote:
>> frame_pointer_needed is set to true in reload pass setup_can_eliminate,
>> but regs_ever_live[31] is false, pro_and_epilogue uses it without live
>> check causing CPU2006 465.tonto segment fault of loading from invalid
>> addresses due to r31 not saved/restored.  Thus, add HARD_FRAME_POINTER_REGNUM
>> live check with frame_pointer_needed_indeed_p when generating 
>> pro_and_epilogue
>> instructions.
> 
> I see.
> 
> Can you instead make a boolean variable "frame_pointer_needed_indeed",
> that you set somewhere early in *logue processing?  So that we can be
> sure that it will not change behind our backs.


Thanks, rs6000_emit_prologue seems the proper place to set
frame_pointer_needed_indeed, but it's strange that hard_frame_pointer_rtx
will be marked USE in make_prologue_seq; does that also need the check here,
even though it doesn't cause a segfault?  PS: this piece of code is in a
different file.

function.c 
static rtx_insn *
make_prologue_seq (void)
{
  if (!targetm.have_prologue ())
return NULL;

  start_sequence ();
  rtx_insn *seq = targetm.gen_prologue ();
  emit_insn (seq);

  /* Insert an explicit USE for the frame pointer
 if the profiling is on and the frame pointer is required.  */
  if (crtl->profile && frame_pointer_needed)
emit_use (hard_frame_pointer_rtx);
...



Anyway, updated the patch as below with your previous comments:



This bug is exposed by the FRE refactoring of r263875.  Comparing the fre
dump files shows no obvious change in the faulting function, which proves
it to be a target issue.
frame_pointer_needed is set to true in the reload pass setup_can_eliminate,
but regs_ever_live[31] is false, and pro_and_epilogue uses it without a liveness
check, causing a CPU2006 465.tonto segmentation fault from loads at invalid
addresses because r31 is not saved/restored.  Thus, add a HARD_FRAME_POINTER_REGNUM
liveness check via frame_pointer_needed_indeed when generating pro_and_epilogue
instructions.

Bootstrapped and regression tested on Power8-LE; all tests pass.  Backport to gcc-9
required once approved.

gcc/ChangeLog

2020-04-17  Xiong Hu Luo  

PR target/91518
* config/rs6000/rs6000-logue.c (frame_pointer_needed_indeed):
New variable.
(rs6000_emit_prologue_components):
Check with frame_pointer_needed_indeed.
(rs6000_emit_epilogue_components): Likewise.
(rs6000_emit_epilogue): Likewise.
(rs6000_emit_prologue): Set frame_pointer_needed_indeed.
---
 gcc/config/rs6000/rs6000-logue.c | 23 ++-
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/gcc/config/rs6000/rs6000-logue.c b/gcc/config/rs6000/rs6000-logue.c
index 4cbf228eb79..2213d1fa227 100644
--- a/gcc/config/rs6000/rs6000-logue.c
+++ b/gcc/config/rs6000/rs6000-logue.c
@@ -58,6 +58,8 @@ static bool rs6000_save_toc_in_prologue_p (void);
 
 static rs6000_stack_t stack_info;
 
+/* Set if HARD_FRAME_POINTER_REGNUM is really needed.  */
+static bool frame_pointer_needed_indeed = false;
 
 /* Label number of label created for -mrelocatable, to call to so we can
get the address of the GOT section */
@@ -2735,9 +2737,9 @@ void
 rs6000_emit_prologue_components (sbitmap components)
 {
   rs6000_stack_t *info = rs6000_stack_info ();
-  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
-? HARD_FRAME_POINTER_REGNUM
-: STACK_POINTER_REGNUM);
+  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed_indeed
+ ? HARD_FRAME_POINTER_REGNUM
+ : STACK_POINTER_REGNUM);
 
   machine_mode reg_mode = Pmode;
   int reg_size = TARGET_32BIT ? 4 : 8;
@@ -2815,9 +2817,9 @@ void
 rs6000_emit_epilogue_components (sbitmap components)
 {
   rs6000_stack_t *info = rs6000_stack_info ();
-  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
-? HARD_FRAME_POINTER_REGNUM
-: STACK_POINTER_REGNUM);
+  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed_indeed
+ ? HARD_FRAME_POINTER_REGNUM
+ : STACK_POINTER_REGNUM);
 
   machine_mode reg_mode = Pmode;
   int reg_size = TARGET_32BIT ? 4 : 8;
@@ -2996,7 +2998,10 @@ rs6000_emit_prologue (void)
&& (lookup_attribute ("no_split_stack",
  DECL_ATTRIBUTES (cfun->decl))
== NULL));
- 
+
+  frame_pointer_needed_indeed
+= frame_pointer_needed && df_regs_ever_live_p (HARD_FRAME_POINTER_REGNUM);
+
   /* Offset to top of frame for frame_reg and sp respectively.  */
   HOST_WIDE_INT frame_off = 0;
   HOST_WIDE_INT sp_off = 0;
@@ -3658,7 +3663,7 @@ rs6000_emit_prologue (void)
 }
 
   /* Set frame pointer, if needed.  */
-  if (frame_pointer_needed)
+  if (frame_pointer_needed_indeed)
 {
   insn = 

[PATCH] Fold (add -1; zero_ext; add +1) operations to zero_ext when not zero (PR37451, PR61837)

2020-04-15 Thread luoxhu--- via Gcc-patches
From: Xionghu Luo 

This "subtract/extend/add" existed for a long time and still annoying us
(PR37451, PR61837) when converting from 32bits to 64bits, as the ctr
register is used as 64bits on powerpc64, Andraw Pinski had a patch but
caused some issue and reverted by Joseph S. Myers(PR37451, PR37782).

Andrew:
http://gcc.gnu.org/ml/gcc-patches/2008-09/msg01070.html
http://gcc.gnu.org/ml/gcc-patches/2008-10/msg01321.html
Joseph:
https://gcc.gnu.org/legacy-ml/gcc-patches/2011-11/msg02405.html

However, the doloop code has improved a lot over the years:
gcc.c-torture/execute/doloop-1.c is no longer a simple loop with a constant
desc->niter_expr since r125:SI#0 is not SImode, so it is not a valid doloop
and no doloop transform is done for it any more.  Thus we can simplify
"subtract/extend/add" to only the extend, as the condition in doloop will
never be false thanks to loop-ch's optimization.
What's more, this patch is slightly different from Andrew's implementation:
the check for ZERO_EXTEND and SImode prevents the count from being changed in
char/short cases, so such cases do not time out on slow platforms as they did
before (see the sketch after the RTL dump below).
Any comments?  Thanks.

doloop-1.c.257r.loop2_doloop
...
12: [r129:DI]=r123:SI
  REG_DEAD r129:DI
  REG_DEAD r123:SI
13: r125:SI=r120:DI#0-0x1
  REG_DEAD r120:DI
14: r120:DI=zero_extend(r125:SI#0)
  REG_DEAD r125:SI
16: r126:CC=cmp(r120:DI,0)
17: pc={(r126:CC!=0)?L43:pc}
  REG_DEAD r126:CC
...
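A hypothetical narrow-counter loop (my sketch, not doloop-1.c itself) showing
the kind of case the ZERO_EXTEND/SImode guard is meant to leave alone:

/* The induction variable is 'unsigned short', so the guard in the patch
   intentionally keeps the (add -1; zero_extend; add +1) sequence for such
   char/short counters instead of folding it.  */
unsigned int
count_bytes (const unsigned char *p, unsigned short n)
{
  unsigned int s = 0;
  for (unsigned short i = 0; i < n; i++)
    s += p[i];
  return s;
}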

Bootstrapped and regression tested on Power8-LE; all tests pass.

gcc/ChangeLog

2020-04-15  Xiong Hu Luo  

PR rtl-optimization/37451, PR target/61837
* loop-doloop.c (doloop_modify): Simplify (add -1; zero_ext; add +1)
to zero_ext.
---
 gcc/loop-doloop.c | 26 +-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/gcc/loop-doloop.c b/gcc/loop-doloop.c
index db6a014e43d..9f967fa3a0b 100644
--- a/gcc/loop-doloop.c
+++ b/gcc/loop-doloop.c
@@ -477,7 +477,31 @@ doloop_modify (class loop *loop, class niter_desc *desc,
 }
 
   if (increment_count)
-count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
+{
+  /* Fold (add -1; zero_ext; add +1) operations to zero_ext, given that addop0
+is never zero: the gimple loop-ch pass has already ensured the loop body is
+not entered when the loop condition is initially false.  */
+  bool simplify_zext = false;
+  rtx extop0 = XEXP (count, 0);
+  if (mode == E_DImode
+ && GET_CODE (count) == ZERO_EXTEND
+ && GET_CODE (extop0) == PLUS)
+   {
+ rtx addop0 = XEXP (extop0, 0);
+ rtx addop1 = XEXP (extop0, 1);
+ if (CONST_SCALAR_INT_P (addop1)
+ && GET_MODE (addop0) == E_SImode
+ && addop1 == GEN_INT (-1))
+   {
+ count = simplify_gen_unary (ZERO_EXTEND, mode, addop0,
+ GET_MODE (addop0));
+ simplify_zext = true;
+   }
+   }
+
+  if (!simplify_zext)
+   count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
+}
 
   /* Insert initialization of the count register into the loop header.  */
   start_sequence ();
-- 
2.21.0.777.g83232e3864



[PATCH v2] rs6000: Don't use HARD_FRAME_POINTER_REGNUM if it's not live in pro_and_epilogue (PR91518)

2020-04-12 Thread luoxhu via Gcc-patches
This bug is exposed by the FRE refactoring of r263875.  Comparing the fre
dump files shows no obvious change in the faulting function, which proves
it to be a target issue.
frame_pointer_needed is set to true in the reload pass setup_can_eliminate,
but regs_ever_live[31] is false, and pro_and_epilogue uses it without a liveness
check, causing a CPU2006 465.tonto segmentation fault from loads at invalid
addresses because r31 is not saved/restored.  Thus, add a HARD_FRAME_POINTER_REGNUM
liveness check via frame_pointer_needed_indeed_p when generating pro_and_epilogue
instructions.

Bootstrapped and regression tested on Power8-LE; all tests pass.  Backport to gcc-9
required once approved.

gcc/ChangeLog

2020-04-13  Xiong Hu Luo  

PR target/91518
* config/rs6000/rs6000-logue.c (frame_pointer_needed_indeed_p):
New function.
(rs6000_emit_prologue_components):
Check with frame_pointer_needed_indeed_p.
(rs6000_emit_epilogue_components): Likewise.
(rs6000_emit_prologue): Likewise.
(rs6000_emit_epilogue): Likewise.
---
 gcc/config/rs6000/rs6000-logue.c | 24 
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/gcc/config/rs6000/rs6000-logue.c b/gcc/config/rs6000/rs6000-logue.c
index 4cbf228eb79..d17876ac0fb 100644
--- a/gcc/config/rs6000/rs6000-logue.c
+++ b/gcc/config/rs6000/rs6000-logue.c
@@ -2730,14 +2730,22 @@ rs6000_disqualify_components (sbitmap components, edge e,
 }
 }
 
+/* Determine whether HARD_FRAME_POINTER_REGNUM is really needed.  */
+static bool
+frame_pointer_needed_indeed_p (void)
+{
+  return frame_pointer_needed
+&& df_regs_ever_live_p (HARD_FRAME_POINTER_REGNUM);
+}
+
 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
 void
 rs6000_emit_prologue_components (sbitmap components)
 {
   rs6000_stack_t *info = rs6000_stack_info ();
-  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
-? HARD_FRAME_POINTER_REGNUM
-: STACK_POINTER_REGNUM);
+  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed_indeed_p ()
+ ? HARD_FRAME_POINTER_REGNUM
+ : STACK_POINTER_REGNUM);
 
   machine_mode reg_mode = Pmode;
   int reg_size = TARGET_32BIT ? 4 : 8;
@@ -2815,9 +2823,9 @@ void
 rs6000_emit_epilogue_components (sbitmap components)
 {
   rs6000_stack_t *info = rs6000_stack_info ();
-  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
-? HARD_FRAME_POINTER_REGNUM
-: STACK_POINTER_REGNUM);
+  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed_indeed_p ()
+ ? HARD_FRAME_POINTER_REGNUM
+ : STACK_POINTER_REGNUM);
 
   machine_mode reg_mode = Pmode;
   int reg_size = TARGET_32BIT ? 4 : 8;
@@ -3658,7 +3666,7 @@ rs6000_emit_prologue (void)
 }
 
   /* Set frame pointer, if needed.  */
-  if (frame_pointer_needed)
+  if (frame_pointer_needed_indeed_p ())
 {
   insn = emit_move_insn (gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM),
 sp_reg_rtx);
@@ -4534,7 +4542,7 @@ rs6000_emit_epilogue (enum epilogue_type epilogue_type)
 }
   /* If we have a frame pointer, we can restore the old stack pointer
  from it.  */
-  else if (frame_pointer_needed)
+  else if (frame_pointer_needed_indeed_p ())
 {
   frame_reg_rtx = sp_reg_rtx;
   if (DEFAULT_ABI == ABI_V4)
-- 
2.21.0.777.g83232e3864



Re: [PATCH] rs6000: Don't split constant operator when add, move to temp register for future optimization

2020-04-03 Thread luoxhu via Gcc-patches



On 2020/4/3 06:16, Segher Boessenkool wrote:
> Hi!
> 
> On Mon, Mar 30, 2020 at 11:59:57AM +0800, luoxhu wrote:
>>> Do we want something later in the RTL pipeline to make "addi"s etc. again?
> 
> (This would be a good thing to consider -- maybe a define_insn_and_split
> will work.  But see below).
> 
>> [PATCH] rs6000: Don't split constant operator add before reload, move to 
>> temp register for future optimization
>>
>> Don't split code from add<mode>3 for SDI to allow a later pass to split.
>> This allows later logic to hoist out constant load in add instructions.
>> In loop, lis+ori could be hoisted out to improve performance compared with
>> previous addis+addi (About 15% on typical case), weak point is
>> one more register is used and one more instruction is generated.  i.e.:
>>
>> addis 3,3,0x8765
>> addi 3,3,0x4321
>>
>> =>
>>
>> lis 9,0x8765
>> ori 9,9,0x4321
>> add 3,3,9
> 
> (This patch will of course have to wait for stage 1).
> 
> Such a define_insn_and_split could be for an add of a (signed) 32-bit
> immediate.  combine will try to combine the three insns (lis;ori;add),
> and match the new pattern.

Currently 286r.split2 splits "12:%9:DI=0x87654321" to lis+ori via
rs6000_emit_set_const from a define_split; do you mean adding a new
define_insn_and_split to do the split?  And another patch for that after this
one goes upstream in stage 1?

> 
> This also links in with Alan's work on big immediates, and with paddi
> insns, etc.

Seems to be PR94393?  Yes, rs6000_emit_set_const calls rs6000_emit_set_long_const
for DImode.  I tried unsigned long constants like 0xabcd87654321, 0xabcd87654321
and 0xc000ULL; all of them are hoisted outside of the loop even without my patch,
and there is no difference with or without Alan's patch.

0xabcd87654321: li 9,0   ori 9,9,0xabcd   sldi 9,9,32   oris 9,9,0x8765   ori 9,9,0x4321
0xabcd87654321: lis 9,0xabcd   ori 9,9,0x8765   sldi 9,9,16   ori 9,9,0x4321
0xc000ULL:   li 9,-1   rldicr 9,9,0,1
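For what it's worth, a hypothetical 64-bit-constant variant of the add-const.c
test further down (my own sketch, not something that was committed) that could
be used to watch the hoisting:

/* The multi-instruction materialisation of the large constant is expected
   to be loop-invariant and hoisted out of the loop.  */
unsigned long
foo64 (unsigned long n, unsigned long h)
{
  unsigned long i;
  for (i = 0; i < n; i++)
    h = ((h + 8) | h) + 0xabcd87654321UL;
  return h;
}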


Thanks,
Xionghu

> 
> 
> Segher
> 



Re: [PATCH] rs6000: Save/restore r31 if frame_pointer_needed is true

2020-03-29 Thread luoxhu via Gcc-patches



On 2020/3/28 00:04, Segher Boessenkool wrote:

Hi!

On Fri, Mar 27, 2020 at 09:34:00AM +0800, luoxhu wrote:

On 2020/3/27 07:59, Segher Boessenkool wrote:

On Wed, Mar 25, 2020 at 11:15:22PM -0500, luo...@linux.ibm.com wrote:

frame_pointer_needed is set to true in reload pass setup_can_eliminate,
but regs_ever_live[31] is false, so pro_and_epilogue doesn't save/restore
r31 even it is used actually, causing CPU2006 465.tonto segment fault of
loading from invalid addresses.


If df_regs_ever_live_p(31) is false there is no hard frame pointer
anywhere in the program.  How can it be used then?


There is a piece of code that emits a move instruction to r31 even when
df_regs_ever_live_p (31) is false in pro_and_epilogue.


Can you point out where (in rs6000-logue.c or similar)?  We should fix
*that*.


As frame_pointer_needed is true and is widely used in this function, I propose
to save r31 in save_reg_p instead of checking
(frame_pointer_needed && df_regs_ever_live_p (31)); I haven't verified whether
this works yet.
Is this reasonable?  Thanks.


frame_pointer_needed is often true when the backend can figure out we do
not actually need it.


rs6000-logue.c
void
rs6000_emit_prologue (void)
{
...
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 +0000 26840)   /* Set frame pointer, if needed.  */
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 +0000 26841)   if (frame_pointer_needed)
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 +0000 26842)     {
0d6c02bf24e4 (jakub  2005-06-30 14:26:32 +0000 26843)       insn = emit_move_insn (gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM),
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 +0000 26844)                              sp_reg_rtx);
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 +0000 26845)       RTX_FRAME_RELATED_P (insn) = 1;
6b02f2a5c61e (meissner   1995-11-30 20:02:16 +0000 26846)     }
d1bd513ed578 (kenner 1992-02-09 19:26:21 +0000 26847)
...
}


Ah, so this you mean.  I see.  It looks like if you change this to

   if (frame_pointer_needed && df_regs_ever_live_p (HARD_FRAME_POINTER_REGNUM))
 {
   insn = emit_move_insn (gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM),
 sp_reg_rtx);
   RTX_FRAME_RELATED_P (insn) = 1;
 }

(so just that "if" clause changes), it'll all be fine.  Could you test
that please?


Tried with the patch below; it still fails at the same place, so I guess this is
not enough.
The instruction at 100dc034 changes r31 from 0x105fdd70 to 0x7fffbbf0 and doesn't
restore it before returning.


100dc020 <__atomvec_module_MOD_atom_index_from_pos>:
   100dc020:   51 10 40 3c     lis     r2,4177
   100dc024:   00 7f 42 38     addi    r2,r2,32512
   100dc028:   28 00 e3 e8     ld      r7,40(r3)
   100dc02c:   e1 ff 21 f8     stdu    r1,-32(r1)
   100dc030:   00 00 27 2c     cmpdi   r7,0
   100dc034:   78 0b 3f 7c     mr      r31,r1
   100dc038:   08 00 82 40     bne     100dc040 <__atomvec_module_MOD_atom_index_from_pos+0x20>
   100dc03c:   01 00 e0 38     li      r7,1
   100dc040:   38 00 43 e9     ld      r10,56(r3)
   100dc044:   30 00 23 e9     ld      r9,48(r3)
   100dc048:   00 00 03 e9     ld      r8,0(r3)


Diff:

diff --git a/gcc/config/rs6000/rs6000-logue.c b/gcc/config/rs6000/rs6000-logue.c
index 4cbf228eb79..28fda1866d8 100644
--- a/gcc/config/rs6000/rs6000-logue.c
+++ b/gcc/config/rs6000/rs6000-logue.c
@@ -3658,7 +3658,7 @@ rs6000_emit_prologue (void)
}

  /* Set frame pointer, if needed.  */
-  if (frame_pointer_needed)
+  if (frame_pointer_needed && df_regs_ever_live_p (HARD_FRAME_POINTER_REGNUM))
{
  insn = emit_move_insn (gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM),
sp_reg_rtx);
@@ -4534,7 +4534,8 @@ rs6000_emit_epilogue (enum epilogue_type epilogue_type)
}
  /* If we have a frame pointer, we can restore the old stack pointer
 from it.  */
-  else if (frame_pointer_needed)
+  else if (frame_pointer_needed
+  && df_regs_ever_live_p (HARD_FRAME_POINTER_REGNUM))
{
  frame_reg_rtx = sp_reg_rtx;
  if (DEFAULT_ABI == ABI_V4)
@@ -4873,7 +4874,8 @@ rs6000_emit_epilogue (enum epilogue_type epilogue_type)
a REG_CFA_DEF_CFA note, but that's OK;  A duplicate is
discarded by dwarf2cfi.c/dwarf2out.c, and in any case would
be harmless if emitted.  */
-  if (frame_pointer_needed)
+  if (frame_pointer_needed
+ && df_regs_ever_live_p (HARD_FRAME_POINTER_REGNUM))
   {
 insn = get_last_insn ();
 add_reg_note (insn, REG_CFA_DEF_CFA,



Program received signal SIGSEGV: Segmentation fault - invalid memory reference.

Backtrace for this error:
#0  0x200a2243 in ???
#1  0x200a0abb in ???
#2  0x200604d7 in ???
#3  0x1027418c in __mol_module_MOD_make_image_of_shell
   at 
/home/luoxhu/workspace/cpu2006/benchspec/CPU2006/465.tonto/build/build_peak_none./mol.fppized.f90:9376
#4  0x10281adf in __mol_module_MOD_symmetrise_r
   at 

Re: [PATCH] rs6000: Don't split constant operator when add, move to temp register for future optimization

2020-03-29 Thread luoxhu via Gcc-patches



On 2020/3/27 22:33, Segher Boessenkool wrote:
> Hi!
> 
> On Thu, Mar 26, 2020 at 05:06:43AM -0500, luo...@linux.ibm.com wrote:
>> Remove split code from add<mode>3 to allow a later pass to split.
>> This allows later logic to hoist out constant load in add instructions.
>> In loop, lis+ori could be hoisted out to improve performance compared with
>> previous addis+addi (About 15% on typical case), weak point is
>> one more register is used and one more instruction is generated.  i.e.:
>>
>> addis 3,3,0x8765
>> addi 3,3,0x4321
>>
>> =>
>>
>> lis 9,0x8765
>> ori 9,9,0x4321
>> add 3,3,9
> 
> What does it do overall?  Say, to SPEC.  What does it do to execution
> time, and what does it do to binary size?
> 
> Do we want something later in the RTL pipeline to make "addi"s etc. again?
> 
>> 2020-03-26  Xiong Hu Luo  
>>
>>  * config/rs6000/rs6000.md (add3): Remove split code, move constant
>>to temp register before add.
> 
> This should not be indented, so just:
>   * config/rs6000/rs6000.md (add3): Remove split code, move constant
>   to temp register before add.
> 
> We have six separate add<mode>3 patterns (three of those are in rs6000.md,
> too).  You can write something like
>   (add<mode>3 for SDI):
> to show which iterator (or mode) this one is for.  This is helpful with
> any <mode> or <MODE> or the like, even if there (currently) is only one
> pattern you could mean.
> 

Thanks, it is necessary to re-enable the add split, as some later RTL passes like
final still need to generate addis+addi (case: g++.dg/opt/thunk1.C).
Updated the patch as below:


[PATCH] rs6000: Don't split constant operator add before reload, move to temp 
register for future optimization

Don't split code from add<mode>3 for SDI to allow a later pass to split.
This allows later logic to hoist out constant load in add instructions.
In loop, lis+ori could be hoisted out to improve performance compared with
previous addis+addi (About 15% on typical case), weak point is
one more register is used and one more instruction is generated.  i.e.:

addis 3,3,0x8765
addi 3,3,0x4321

=>

lis 9,0x8765
ori 9,9,0x4321
add 3,3,9

No obvious performance and binary size change to SPEC2017.
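As a side note, here is a small worked example (mine, not part of the patch) of
the ((val & 0xffff) ^ 0x8000) - 0x8000 split that is kept in the
!can_create_pseudo_p path of the diff below: low is the sign-extended bottom
16 bits and rest is the remainder, so rest feeds the addis and low the addi.

#include <stdio.h>

int
main (void)
{
  long long val = 0x87654321;
  long long low = ((val & 0xffff) ^ 0x8000) - 0x8000;  /* 0x4321 */
  long long rest = val - low;                          /* 0x87650000 */
  printf ("low=%#llx rest=%#llx sum=%#llx\n",
          (unsigned long long) low, (unsigned long long) rest,
          (unsigned long long) (low + rest));
  return 0;
}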

gcc/ChangeLog:

2020-03-30  Xiong Hu Luo  

* config/rs6000/rs6000.md (add<mode>3 for SDI): Don't split before reload,
move constant to temp register for add.

gcc/testsuite/ChangeLog:

2020-03-26  Xiong Hu Luo  

* gcc.target/powerpc/add-const.c: New.
---
 gcc/config/rs6000/rs6000.md  | 51 +---
 gcc/testsuite/gcc.target/powerpc/add-const.c | 18 +++
 2 files changed, 42 insertions(+), 27 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/add-const.c

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index ad88b6783af..76af3d5b1d9 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -1729,34 +1729,31 @@ (define_expand "add<mode>3"
 
   if (CONST_INT_P (operands[2]) && !add_operand (operands[2], <MODE>mode))
 {
-  rtx tmp = ((!can_create_pseudo_p ()
- || rtx_equal_p (operands[0], operands[1]))
-? operands[0] : gen_reg_rtx (<MODE>mode));
-
-  /* Adding a constant to r0 is not a valid insn, so use a different
-strategy in that case.  */
-  if (reg_or_subregno (operands[1]) == 0 || reg_or_subregno (tmp) == 0)
-   {
- if (operands[0] == operands[1])
-   FAIL;
- rs6000_emit_move (operands[0], operands[2], <MODE>mode);
- emit_insn (gen_add<mode>3 (operands[0], operands[1], operands[0]));
- DONE;
-   }
-
-  HOST_WIDE_INT val = INTVAL (operands[2]);
-  HOST_WIDE_INT low = ((val & 0xffff) ^ 0x8000) - 0x8000;
-  HOST_WIDE_INT rest = trunc_int_for_mode (val - low, <MODE>mode);
-
-  if (<MODE>mode == DImode && !satisfies_constraint_L (GEN_INT (rest)))
-   FAIL;
+  if (can_create_pseudo_p ())
+  {
+   rtx tmp = rtx_equal_p (operands[0], operands[1])
+ ? operands[0] : gen_reg_rtx (<MODE>mode);
 
-  /* The ordering here is important for the prolog expander.
-When space is allocated from the stack, adding 'low' first may
-produce a temporary deallocation (which would be bad).  */
-  emit_insn (gen_add<mode>3 (tmp, operands[1], GEN_INT (rest)));
-  emit_insn (gen_add<mode>3 (operands[0], tmp, GEN_INT (low)));
-  DONE;
+   rs6000_emit_move (tmp, operands[2], <MODE>mode);
+   emit_insn (gen_add<mode>3 (operands[0], operands[1], tmp));
+   DONE;
+  }
+  else
+  {
+   HOST_WIDE_INT val = INTVAL (operands[2]);
+   HOST_WIDE_INT low = ((val & 0xffff) ^ 0x8000) - 0x8000;
+   HOST_WIDE_INT rest = trunc_int_for_mode (val - low, <MODE>mode);
+
+   if (<MODE>mode == DImode && !satisfies_constraint_L (GEN_INT (rest)))
+ FAIL;
+
+   /* The ordering here is important for the prolog expander.
+  When space is allocated from the stack, adding 'low' first may
+  produce a temporary deallocation (which would be bad).  */
+   emit_insn 

Re: [PATCH] rs6000: Save/restore r31 if frame_pointer_needed is true

2020-03-26 Thread luoxhu via Gcc-patches



On 2020/3/27 07:59, Segher Boessenkool wrote:
> Hi!
> 
> On Wed, Mar 25, 2020 at 11:15:22PM -0500, luo...@linux.ibm.com wrote:
>> frame_pointer_needed is set to true in reload pass setup_can_eliminate,
>> but regs_ever_live[31] is false, so pro_and_epilogue doesn't save/restore
>> r31 even it is used actually, causing CPU2006 465.tonto segment fault of
>> loading from invalid addresses.
> 
> If df_regs_ever_live_p(31) is false there is no hard frame pointer
> anywhere in the program.  How can it be used then?

There is a piece of code that emits a move instruction to r31 even when
df_regs_ever_live_p (31) is false in pro_and_epilogue.  As frame_pointer_needed
is true and is widely used in this function, I propose to save r31 in save_reg_p
instead of checking (frame_pointer_needed && df_regs_ever_live_p (31)); I haven't
verified whether this works yet.
Is this reasonable?  Thanks.
 
rs6000-logue.c
void
rs6000_emit_prologue (void)
{
...
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 +0000 26840)   /* Set frame pointer, if needed.  */
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 +0000 26841)   if (frame_pointer_needed)
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 +0000 26842)     {
0d6c02bf24e4 (jakub  2005-06-30 14:26:32 +0000 26843)       insn = emit_move_insn (gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM),
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 +0000 26844)                              sp_reg_rtx);
bbd21807fdf6 (geoffk 2000-03-16 03:16:41 +0000 26845)       RTX_FRAME_RELATED_P (insn) = 1;
6b02f2a5c61e (meissner   1995-11-30 20:02:16 +0000 26846)     }
d1bd513ed578 (kenner 1992-02-09 19:26:21 +0000 26847)
...
}

Xionghu

> 
> 
> Segher
> 



[PATCH] rs6000: Don't split constant operator when add, move to temp register for future optimization

2020-03-26 Thread luoxhu--- via Gcc-patches
From: Xionghu Luo 

Remove split code from add<mode>3 to allow a later pass to split.
This allows later logic to hoist out constant load in add instructions.
In loop, lis+ori could be hoisted out to improve performance compared with
previous addis+addi (About 15% on typical case), weak point is
one more register is used and one more instruction is generated.  i.e.:

addis 3,3,0x8765
addi 3,3,0x4321

=>

lis 9,0x8765
ori 9,9,0x4321
add 3,3,9

gcc/ChangeLog:

2020-03-26  Xiong Hu Luo  

* config/rs6000/rs6000.md (add<mode>3): Remove split code, move constant
  to temp register before add.

gcc/testsuite/ChangeLog:

2020-03-26  Xiong Hu Luo  

* gcc.target/powerpc/add-const.c: New.
---
 gcc/config/rs6000/rs6000.md  | 25 ++--
 gcc/testsuite/gcc.target/powerpc/add-const.c | 18 ++
 2 files changed, 20 insertions(+), 23 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/add-const.c

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index ad88b6783af..72f3f604e0d 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -1733,29 +1733,8 @@ (define_expand "add<mode>3"
  || rtx_equal_p (operands[0], operands[1]))
 ? operands[0] : gen_reg_rtx (<MODE>mode));
 
-  /* Adding a constant to r0 is not a valid insn, so use a different
-strategy in that case.  */
-  if (reg_or_subregno (operands[1]) == 0 || reg_or_subregno (tmp) == 0)
-   {
- if (operands[0] == operands[1])
-   FAIL;
- rs6000_emit_move (operands[0], operands[2], <MODE>mode);
- emit_insn (gen_add<mode>3 (operands[0], operands[1], operands[0]));
- DONE;
-   }
-
-  HOST_WIDE_INT val = INTVAL (operands[2]);
-  HOST_WIDE_INT low = ((val & 0xffff) ^ 0x8000) - 0x8000;
-  HOST_WIDE_INT rest = trunc_int_for_mode (val - low, <MODE>mode);
-
-  if (<MODE>mode == DImode && !satisfies_constraint_L (GEN_INT (rest)))
-   FAIL;
-
-  /* The ordering here is important for the prolog expander.
-When space is allocated from the stack, adding 'low' first may
-produce a temporary deallocation (which would be bad).  */
-  emit_insn (gen_add<mode>3 (tmp, operands[1], GEN_INT (rest)));
-  emit_insn (gen_add<mode>3 (operands[0], tmp, GEN_INT (low)));
+  rs6000_emit_move (tmp, operands[2], <MODE>mode);
+  emit_insn (gen_add<mode>3 (operands[0], operands[1], tmp));
   DONE;
 }
 })
diff --git a/gcc/testsuite/gcc.target/powerpc/add-const.c b/gcc/testsuite/gcc.target/powerpc/add-const.c
new file mode 100644
index 000..e1007247b32
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/add-const.c
@@ -0,0 +1,18 @@
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-options "-O3 -fno-unroll-loops" } */
+
+/* Ensure the lis,ori are generated, which indicates they have
+   been hoisted outside of the loop.  */
+
+typedef unsigned long ulong;
+ulong
+foo (ulong n, ulong h)
+{
+  int i;
+  for (i = 0; i < n; i++)
+h = ((h + 8) | h) + 0x87654321;
+  return h;
+}
+
+/* { dg-final { scan-assembler-times {\mlis\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mori\M} 1 } } */
-- 
2.21.0.777.g83232e3864



[PATCH] rs6000: Save/restore r31 if frame_pointer_needed is true

2020-03-25 Thread luoxhu--- via Gcc-patches
From: Xionghu Luo 

This P1 bug is exposed by the FRE refactoring of r263875.  Comparing the fre
dump files shows no obvious change in the faulting function, which proves
it to be a target issue.
frame_pointer_needed is set to true in the reload pass setup_can_eliminate,
but regs_ever_live[31] is false, so pro_and_epilogue doesn't save/restore
r31 even though it is actually used, causing a CPU2006 465.tonto segmentation
fault from loads at invalid addresses.
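For context, a hypothetical reproducer shape (assumed; the real failure is only
seen in 465.tonto) where frame_pointer_needed is forced on:

/* A variable-length array forces frame_pointer_needed, yet r31 may never
   appear in the body's RTL, so df_regs_ever_live_p (31) can stay false,
   which is the mismatch described above.  */
void
fill_vla (int n, int *out)
{
  int buf[n];
  for (int i = 0; i < n; i++)
    buf[i] = i;
  *out = buf[n - 1];
}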

Bootstrapped and regression tested on Power8-LE; all tests pass.  Backport to gcc-9
required once approved.

gcc/ChangeLog

2020-03-26  Xiong Hu Luo  

PR target/91518
* config/rs6000/rs6000-logue.c (save_reg_p): Save r31 if
frame_pointer_needed.
---
 gcc/config/rs6000/rs6000-logue.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/config/rs6000/rs6000-logue.c b/gcc/config/rs6000/rs6000-logue.c
index 4cbf228eb79..7b62ff03afd 100644
--- a/gcc/config/rs6000/rs6000-logue.c
+++ b/gcc/config/rs6000/rs6000-logue.c
@@ -116,6 +116,9 @@ save_reg_p (int reg)
return true;
 }
 
+  if (reg == HARD_FRAME_POINTER_REGNUM && frame_pointer_needed)
+return true;
+
   return !call_used_or_fixed_reg_p (reg) && df_regs_ever_live_p (reg);
 }
 
-- 
2.21.0.777.g83232e3864