Re: RFA: Add lock_length attribute to support the ARC port (Was: Re: Ping: RFA: add lock_length attribute to break branch-shortening cycles)

2012-10-24 Thread Joseph S. Myers
On Tue, 23 Oct 2012, Joern Rennecke wrote:

> I'll be posting the ARC port shortly; it does not fit into a single 100 KB
> posting, so I'm thinking of splitting it into a configury patch and
> xz-compressed files/tarballs for arc.c, arc.md, libgcc, and the rest of the port.

The size limit is 400 kB, not 100 kB.  Hopefully that means you don't need 
to compress so many bits and can attach more as plain patches for future 
revisions.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: RFA: Add lock_length attribute to support the ARC port (Was: Re: Ping: RFA: add lock_length attribute to break branch-shortening cycles)

2012-10-24 Thread Joern Rennecke

Quoting Richard Biener :


Just to add some extra information, can you quote your ports
ADJUST_INSN_LENGTH and one example instruction with length/lock_length
attribute the above applies to?


This is a bit beside the point, since lock_length does mostly the traditional
branch shortening (with alignment effects rolled in for branches - I'm not
really happy about this, but that was required for convergence), length
does alignment effects on top, and ADJUST_INSN_LENGTH does random
context-sensitive adjustments related to pipeline quirks and conditional
execution.

I'm afraid this gcc port's logic in this area is both more complicated than
what you would expect in an average gcc port, and much more simplistic than
what a competent assembler programmer uses to optimize code for these
microarchitectures.

from arc.h:

#define ADJUST_INSN_LENGTH(X, LENGTH)   \
  (LENGTH) += arc_adjust_insn_length ((X), (LENGTH))

from arc.c:

/* Return length adjustment for INSN.  */
int
arc_adjust_insn_length (rtx insn, int len)
{
  int adj = 0;

  if (!INSN_P (insn))
    return 0;
  if (GET_CODE (PATTERN (insn)) == SEQUENCE)
    {
      int adj0, adj1, len1;
      rtx pat = PATTERN (insn);
      rtx i0 = XVECEXP (pat, 0, 0);
      rtx i1 = XVECEXP (pat, 0, 1);

      len1 = get_attr_lock_length (i1);
      gcc_assert (!len || len >= 4 || (len == 2 && get_attr_iscompact (i1)));
      if (!len1)
        len1 = get_attr_iscompact (i1) != ISCOMPACT_FALSE ? 2 : 4;
      adj0 = arc_adjust_insn_length (i0, len - len1);
      adj1 = arc_adjust_insn_length (i1, len1);
      return adj0 + adj1;
    }
  if (recog_memoized (insn) == CODE_FOR_doloop_end_i)
    {
      rtx prev = prev_nonnote_insn (insn);

      return ((LABEL_P (prev)
               || (TARGET_ARC600
                   && (JUMP_P (prev)
                       || GET_CODE (PATTERN (prev)) == SEQUENCE)))
              ? 4 : 0);
    }

  /* Check for return with but one preceding insn since function
     start / call.  */
  if (TARGET_PAD_RETURN
      && JUMP_P (insn)
      && GET_CODE (PATTERN (insn)) != ADDR_VEC
      && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC
      && get_attr_type (insn) == TYPE_RETURN)
    {
      rtx prev = prev_active_insn (insn);

      if (!prev || !(prev = prev_active_insn (prev))
          || ((NONJUMP_INSN_P (prev)
               && GET_CODE (PATTERN (prev)) == SEQUENCE)
              ? CALL_ATTR (XVECEXP (PATTERN (prev), 0, 0), NON_SIBCALL)
              : CALL_ATTR (prev, NON_SIBCALL)))
        return 4;
    }
  /* Rtl changes too much before arc_reorg to keep ccfsm state.
     But we are not required to give exact answers then.  */
  if (cfun->machine->arc_reorg_started
      && (JUMP_P (insn) || (len & 2)))
    {
      struct arc_ccfsm *statep = &cfun->machine->ccfsm_current;

      arc_ccfsm_advance_to (insn);
      switch (statep->state)
        {
        case 0:
          break;
        case 1: case 2:
          /* Deleted branch.  */
          return -len;
        case 3: case 4: case 5:
          /* Conditionalized insn.  */
          if ((!JUMP_P (insn)
               || (get_attr_type (insn) != TYPE_BRANCH
                   && get_attr_type (insn) != TYPE_UNCOND_BRANCH
                   && (get_attr_type (insn) != TYPE_RETURN
                       || (statep->cc != ARC_CC_EQ && statep->cc != ARC_CC_NE)
                       || NEXT_INSN (PREV_INSN (insn)) != insn)))
              && (len & 2))
            adj = 2;
          break;
        default:
          gcc_unreachable ();
        }
    }
  if (TARGET_ARC600)
    {
      rtx succ = next_real_insn (insn);

      if (succ && INSN_P (succ))
        adj += arc600_corereg_hazard (insn, succ);
    }

  /* Restore extracted operands - otherwise splitters like the
     addsi3_mixed one can go awry.  */
  extract_constrain_insn_cached (insn);

  return adj;
}

From arc.md:

; Since the demise of REG_N_SETS, it is no longer possible to find out
; in the prologue / epilogue expanders how many times blink is set.
; Using df_regs_ever_live_p to decide if blink needs saving means that
; any explicit use of blink will cause it to be saved; hence we cannot
; represent the blink use in return / sibcall instructions themselves, and
; instead have to show it in EPILOGUE_USES and must explicitly
; forbid instructions that change blink in the return / sibcall delay slot.
(define_insn "return_i"
  [(return)]
  "reload_completed"
{
  rtx reg
    = gen_rtx_REG (Pmode,
                   arc_return_address_regs[arc_compute_function_type (cfun)]);

  if (TARGET_PAD_RETURN)
    arc_pad_return ();
  output_asm_insn ("j%!%* [%0]%&", &reg);
  return "";
}
  [(set_attr "type" "return")
   (set_attr "cond" "canuse")
   (set (attr "iscompact")
(cond [(eq (symbol_ref "arc_compute_function_type (cfun)")
   (symbol_ref "ARC_FUNCTION_NORMAL"))
   (const_string "maybe")]
  (const_string "false")))
   (set (attr "length")
(cond [(eq (symbol_ref "arc
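For reference, the EPILOGUE_USES half of the arrangement described in the
comment above would look roughly like the following sketch; RETURN_ADDR_REGNUM
is an assumed name for blink's hard register number, not necessarily what the
port actually uses:

/* Sketch: mark the return-address register (blink) as live at function
   exit via EPILOGUE_USES rather than representing the use in the
   return / sibcall patterns themselves.  */
#define EPILOGUE_USES(REGNO) ((REGNO) == RETURN_ADDR_REGNUM)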

Re: RFA: Add lock_length attribute to support the ARC port (Was: Re: Ping: RFA: add lock_length attribute to break branch-shortening cycles)

2012-10-24 Thread Richard Biener
On Wed, Oct 24, 2012 at 3:42 AM, Joern Rennecke
 wrote:
> Quoting Richard Biener :
>
>> On Tue, Oct 16, 2012 at 9:35 PM, Joern Rennecke
>>  wrote:
>
> ..
>>>
>>> Well, we could split it anyway, and give ports without the need for
>>> multiple length attributes the benefit of the optimistic algorithm.
>>>
>>> I have attached a patch that implements this.
>>
>>
>> Looks reasonable to me, though I'm not familiar enough with the code
>> to approve it.
>
>
> Now that Richard Sandiford has reviewed that split-off part and it's in
> the source tree, we can return to the remaining functionality needed
> for the ARC port.
>
>> I'd strongly suggest to try harder to make things work for you without
>> the new attribute even though I wasn't really able to follow your
>> reasoning
>> on why that wouldn't work.  It may be easier to motivate this change
>> once the port is in without that attribute so one can actually look at
>> the machine description and port details.
>
>
> Well, it doesn't simply drop in with the existing branch shortening -
> libgcc won't compile because of out-of-range branches.
> I tried to lump the length and lock_length attributes together, and that
> just gives genattrtab indigestion.  It sits there looping forever.
> I could start debugging this, but that would take an unknown amount of
> time, and then the review of the fix would take an unknown amount of time,
> and then the ARC port probably needs fixing up again because it just
> doesn't work right with these fudged lengths.  And even if we could get
> everything required in before the close of phase 1, the branch shortening
> would be substandard.
> It seems more productive to get the branch shortening working now.
> The lock_length attribute is completely optional, so no port maintainer
> would be forced to use the functionality if it's not desired.
>
> The issue is that some instructions need to be aligned or unaligned
> for performance or in a few cases even for correctness.  Just inserting
> random nops would be a waste of code space and cycles, since most of the
> time, the desired (mis)alignment can be achieved by selectively making
> a short instruction long.  If an instruction that is long once was forced
> to stay long henceforth, that would actually defeat the purpose of getting
> the desired alignment.  Then another short instruction - if it can be
> found - would need to be upsized.  So a size increase could ripple through
> as alignments are distorted.  The natural thing to do when the alignment
> changes is really to let the upsized instruction be short again.
> Only length-determined branch sizes have to be locked to avoid cycles.

Just to add some extra information, can you quote your ports
ADJUST_INSN_LENGTH and one example instruction with length/lock_length
attribute the above applies to?

> This is the documentation for the new role of the lock_length attribute
> (reduced from my previous attempt):
>
> @cindex lock_length
> Usually, when doing optimizing branch shortening, the instruction length
> is calculated by evaluating the @code{length} attribute, applying
> @code{ADJUST_INSN_LENGTH}, and taking the maximum of the resultant
> value and the length of the instruction in the previous iteration.

Which sounds straight-forward.  The docs of ADJUST_INSN_LENGTH
are not entirely clear, but it sounds like it may only increase length, correct?
I see that ADJUST_INSN_LENGTH is really not needed as everything
should be expressible in the length attribute of an instruction?

> If you define the @code{lock_length} attribute, the @code{lock_length}
> attribute will be evaluated, and then the maximum with of @code{lock_length}

with of?  I read it as 'of the'

> value from the previous iteration will be formed and saved.

So lock_length will only increase during iteration.

> Then the maximum of that value with the @code{length} attribute will
> be formed, and @code{ADJUST_INSN_LENGTH} will be applied.

ADJUST_INSN_LENGTH will be applied to the maximum?  What will
be the 'instruction length' equivalent to the simple non-lock_length case?
Is it the above, max (ADJUST_INSN_LENGTH (lock-length-max, length))?

> Thus, you can allow the length to vary downwards as well as upwards
> across iterations with suitable definitions of the @code{length} attribute
> and/or @code{ADJUST_INSN_LENGTH}.  Care has to be taken that this does not
> lead to infinite loops.

I don't see that you can shrink length with just suitable lock_length and
length attributes.  What seems to be the cruical difference is that you
apply ADJUST_INSN_LENGTH after combining lock-length-max and length.
But then you _decrease_ length with ADJUST_INSN_LENGTH ...

Maybe what you really want is ADJUST_INSN_LENGTH_AFTER which is
applied afterwards?  Thus,

> Usually, when doing optimizing branch shortening, the instruction length
> is calculated by evaluating the @code{length} attribute, applying
> @code{ADJUST_INSN_LENGTH}, and taking the maximum of the re

RFA: Add lock_length attribute to support the ARC port (Was: Re: Ping: RFA: add lock_length attribute to break branch-shortening cycles)

2012-10-23 Thread Joern Rennecke

Quoting Richard Biener :


On Tue, Oct 16, 2012 at 9:35 PM, Joern Rennecke
 wrote:

..

Well, we could split it anyway, and give ports without the need for
multiple length attributes the benefit of the optimistic algorithm.

I have attached a patch that implements this.


Looks reasonable to me, though I'm not familiar enough with the code
to approve it.


Now that Richard Sandiford has reviewed that split-off part and it's in
the source tree, we can return to the remaining functionality needed
for the ARC port.


I'd strongly suggest to try harder to make things work for you without
the new attribute even though I wasn't really able to follow your reasoning
on why that wouldn't work.  It may be easier to motivate this change
once the port is in without that attribute so one can actually look at
the machine description and port details.


Well, it doesn't simply drop in with the existing branch shortening -
libgcc won't compile because of out-of-range branches.
I tried to lump the length and lock_length attributes together, and that
just gives genattrtab indigestion.  It sits there looping forever.
I could start debugging this, but that would take an unknown amount of
time, and then the review of the fix would take an unknown amount of time,
and then the ARC port probably needs fixing up again because it just
doesn't work right with these fudged lengths.  And even if we could get
everything required in before the close of phase 1, the branch shortening
would be substandard.
It seems more productive to get the branch shortening working now.
The lock_length attribute is completely optional, so no port maintainer
would be forced to use the functionality if it's not desired.

The issue is that some instructions need to be aligned or unaligned
for performance or in a few cases even for correctness.  Just inserting
random nops would be a waste of code space and cycles, since most of the
time, the desired (mis)alignment can be achieved by selectively making
a short instruction long.  If an instruction that is long once was forced
to stay long henceforth, that would actually defeat the purpose of getting
the desired alignment.  Then another short instruction - if it can be found -
would need to be upsized.  So a size increase could ripple through as
alignments are distorted.  The natural thing to do when the alignment
changes is really to let the upsized instruction be short again.
Only length-determined branch sizes have to be locked to avoid cycles.
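
To make the upsizing trick concrete, here is a toy sketch of the choice
involved (invented sizes and names, not code from the port):

/* Toy model of the (mis)alignment trick: rather than emitting a nop,
   pick the long encoding of a size-flexible insn when the short one
   would leave the next instruction on an unwanted boundary.  */
static int
choose_encoding_size (int insn_address, int next_wants_4byte_alignment)
{
  const int short_len = 2, long_len = 4;

  if (next_wants_4byte_alignment
      && (insn_address + short_len) % 4 != 0
      && (insn_address + long_len) % 4 == 0)
    return long_len;   /* upsize instead of padding with a nop */
  return short_len;    /* otherwise prefer the compact encoding */
}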

This is the documentation for the new role of the lock_length attribute
(reduced from my previous attempt):

@cindex lock_length
Usually, when doing optimizing branch shortening, the instruction length
is calculated by evaluating the @code{length} attribute, applying
@code{ADJUST_INSN_LENGTH}, and taking the maximum of the resultant
value and the length of the instruction in the previous iteration.

If you define the @code{lock_length} attribute, the @code{lock_length}
attribute will be evaluated, and then the maximum with of @code{lock_length}
value from the previous iteration will be formed and saved.
Then the maximum of that value with the @code{length} attribute will
be formed, and @code{ADJUST_INSN_LENGTH} will be applied.
Thus, you can allow the length to vary downwards as well as upwards
across iterations with suitable definitions of the @code{length} attribute
and/or @code{ADJUST_INSN_LENGTH}.  Care has to be taken that this does not
lead to infinite loops.
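
Spelled out as code, the intended per-insn update in each iteration is
roughly the following sketch (iteration_length and prev_lock_length are
illustrative names; the real shorten_branches keeps this state in
uid-indexed arrays):

/* Sketch of the update rule described above, applied to one insn per
   iteration of branch shortening.  */
static int
iteration_length (rtx insn, int prev_lock_length, int *lock_length_out)
{
  int len;

  /* lock_length only ever ratchets upwards across iterations...  */
  *lock_length_out = MAX (get_attr_lock_length (insn), prev_lock_length);

  /* ...and length is clamped to at least that value, but may otherwise
     shrink again in a later iteration.  */
  len = MAX (*lock_length_out, get_attr_length (insn));

  /* ADJUST_INSN_LENGTH applies last, to the combined value.  */
  ADJUST_INSN_LENGTH (insn, len);
  return len;
}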

The new patch builds on this patch:
http://gcc.gnu.org/ml/gcc-patches/2012-10/msg01890.html
as a prerequisite.

Build tested with libraries in revision 192654 for i686-pc-linux-gnu X
mipsel-elf.

bootstrapped in revision 192703 on i686-pc-linux-gnu;
I've also successfully run config-list.mk with the patch applied to this
revision.  The following ports had pre-existing failures, which are
documented in the sub-PRs or PR47093/PR44756:

alpha64-dec-vms alpha-dec-vms am33_2.0-linux arm-netbsdelf arm-wrs-vxworks
avr-elf avr-rtems c6x-elf c6x-uclinux cr16-elf fr30-elf
i686-interix3 --enable-obsolete i686-openbsd3.0 i686-pc-msdosdjgpp
ia64-hp-vms iq2000-elf lm32-elf lm32-rtems lm32-uclinux mep-elf
microblaze-elf microblaze-linux mn10300-elf moxie-elf moxie-rtems
moxie-uclinux pdp11-aout picochip-elf --enable-obsolete rl78-elf
rx-elf score-elf --enable-obsolete sh5el-netbsd sh64-elf --with-newlib
sh64-linux sh64-netbsd sh-elf shle-linux sh-netbsdelf sh-rtems sh-superh-elf
sh-wrs-vxworks tilegx-linux-gnu tilepro-linux-gnu vax-linux-gnu vax-netbsdelf
vax-openbsd x86_64-knetbsd-gnu


I'll be posting the ARC port shortly; it does not fit into a single 100 KB
posting, so I'm thinking of splitting it into a configury patch and
xz-compressed files/tarballs for arc.c, arc.md, libgcc, and the rest of
the port.
2012-10-22  Joern Rennecke  

* doc/md.texi (node Defining Attributes): Add lock_length to
table of special attributes.
(node Insn Lengths): Document lock_length attrib

Re: Ping: RFA: add lock_length attribute to break branch-shortening cycles

2012-10-20 Thread Richard Sandiford
Joern Rennecke  writes:
> Quoting Richard Sandiford :
>> I think instead the set-up loop should have:
>>
>>   if (GET_CODE (body) == ADDR_VEC || GET_CODE (body) == ADDR_DIFF_VEC)
>>  {
>> #ifdef CASE_VECTOR_SHORTEN_MODE
>>if (increasing && GET_CODE (body) == ADDR_DIFF_VEC)
>>  PUT_MODE (body, CASE_VECTOR_SHORTEN_MODE (0, 0, body));
>> #endif
>>/* This only takes room if read-only data goes into the text
>>   section.  */
>>if (JUMP_TABLES_IN_TEXT_SECTION
>>|| readonly_data_section == text_section)
>>  insn_lengths[uid] = (XVECLEN (body,
>>GET_CODE (body) == ADDR_DIFF_VEC)
>>   * GET_MODE_SIZE (GET_MODE (body)));
>>/* Alignment is handled by ADDR_VEC_ALIGN.  */
>>  }
>>
>> (with just the CASE_VECTOR_SHORTEN_MODE part being new).
>> We then start with the most optimistic length possible,
>> as with everything else.
>
> Well, ports could always tailor the initial mode with CASE_VECTOR_MODE,
> but it is indeed simpler when the branch shortening pass provides a
> sensible initialization.

That, plus I think CASE_VECTOR_MODE should always be the conservatively
correct mode.

> How about putting this at the end of this block:
> #ifdef CASE_VECTOR_SHORTEN_MODE
>if (optimize)
>  {
>/* Look for ADDR_DIFF_VECs, and initialize their minimum and maximum
>   label fields.  */

OK, that does sound better.

> With regards to in what way it would make sense to change increasing to
> something other than optimize, I think we could have a flag to control
> this, which is turned on by default at -O1; it could then be turned off
> if people want only some other (quicker?) optimizations, or if they want
> to work around a machine-specific bug triggered by a single source file.
> I.e. optimize might be set when increasing is not.  Vice versa it makes
> little sense - iterating branch shortening is certainly an optimization.
> So is using CASE_VECTOR_SHORTEN_MODE, but it does not require iterating,
> as we've already calculated the required addresses in the preliminary pass
> before the main loop.
> So it makes sense to keep the condition for CASE_VECTOR_SHORTEN_MODE
> as 'optimize' (or use a different flag, e.g. case_vector_shorten_p),
> and have this at the end of this initial CASE_VECTOR_SHORTEN_MODE block:
>
>flags.min_after_base = min > rel;
>flags.max_after_base = max > rel;
>ADDR_DIFF_VEC_FLAGS (pat) = flags;
>
> + if (increasing)
> +   PUT_MODE (body, CASE_VECTOR_SHORTEN_MODE (0, 0, body));
>  }
>  }
> #endif /* CASE_VECTOR_SHORTEN_MODE */
>
>continue;
>  }
> #endif /* CASE_VECTOR_SHORTEN_MODE */
>
>
>
>> The main shortening if statement should then be conditional on:
>>
>> #ifdef CASE_VECTOR_SHORTEN_MODE
>>if (increasing
>>&& JUMP_P (insn)
>>&& GET_CODE (PATTERN (insn)) == ADDR_DIFF_VEC)
>
> As explained above, CASE_VECTOR_SHORTEN_MODE can make sense in the
> decreasing/non-iterating branch shortening mode.
>
> so I think the code for changing the mode should be
> if (!increasing
>     || (GET_MODE_SIZE (vec_mode)
>         >= GET_MODE_SIZE (GET_MODE (body))))
>   PUT_MODE (body, vec_mode);

OK, sounds good.

Richard


Re: Ping: RFA: add lock_length attribute to break branch-shortening cycles

2012-10-20 Thread Joern Rennecke

Quoting Richard Sandiford :


I think instead the set-up loop should have:

  if (GET_CODE (body) == ADDR_VEC || GET_CODE (body) == ADDR_DIFF_VEC)
{
#ifdef CASE_VECTOR_SHORTEN_MODE
  if (increasing && GET_CODE (body) == ADDR_DIFF_VEC)
PUT_MODE (body, CASE_VECTOR_SHORTEN_MODE (0, 0, body));
#endif
  /* This only takes room if read-only data goes into the text
 section.  */
  if (JUMP_TABLES_IN_TEXT_SECTION
  || readonly_data_section == text_section)
insn_lengths[uid] = (XVECLEN (body,
  GET_CODE (body) == ADDR_DIFF_VEC)
 * GET_MODE_SIZE (GET_MODE (body)));
  /* Alignment is handled by ADDR_VEC_ALIGN.  */
}

(with just the CASE_VECTOR_SHORTEN_MODE part being new).
We then start with the most optimistic length possible,
as with everything else.


Well, ports could always tailor the initial mode with CASE_VECTOR_MODE,
but it is indeed simpler when the branch shortening pass provides a
sensible initialization.

How about putting this at the end of this block:
#ifdef CASE_VECTOR_SHORTEN_MODE
  if (optimize)
{
  /* Look for ADDR_DIFF_VECs, and initialize their minimum and maximum
 label fields.  */

Of course increasing would have to be set earlier.

With regards to in what way it would make sense to change increasing to
something other than optimize, I think we could have a flag to control
this, which is turned on by default at -O1; it could then be turned off
if people want only some other (quicker?) optimizations, or if they want
to work around a machine-specific bug triggered by a single source file.
I.e. optimize might be set when increasing is not.  Vice versa it makes
little sense - iterating branch shortening is certainly an optimization.
So is using CASE_VECTOR_SHORTEN_MODE, but it does not require iterating,
as we've already calculated the required addresses in the preliminary pass
before the main loop.
So it makes sense to keep the condition for CASE_VECTOR_SHORTEN_MODE
as 'optimize' (or use a different flag, e.g. case_vector_shorten_p),
and have this at the end of this initial CASE_VECTOR_SHORTEN_MODE block:

  flags.min_after_base = min > rel;
  flags.max_after_base = max > rel;
  ADDR_DIFF_VEC_FLAGS (pat) = flags;

+ if (increasing)
+   PUT_MODE (body, CASE_VECTOR_SHORTEN_MODE (0, 0, body));
}
}
#endif /* CASE_VECTOR_SHORTEN_MODE */

  continue;
}
#endif /* CASE_VECTOR_SHORTEN_MODE */




The main shortening if statement should then be conditional on:

#ifdef CASE_VECTOR_SHORTEN_MODE
  if (increasing
  && JUMP_P (insn)
  && GET_CODE (PATTERN (insn)) == ADDR_DIFF_VEC)


As explained above, CASE_VECTOR_SHORTEN_MODE can make sense in the
decreasing/non-iterating branch shortening mode.

so I think the code for changing the mode should be
  if (!increasing
      || (GET_MODE_SIZE (vec_mode)
          >= GET_MODE_SIZE (GET_MODE (body))))
    PUT_MODE (body, vec_mode);


Re: Ping: RFA: add lock_length attribute to break branch-shortening cycles

2012-10-20 Thread Richard Sandiford
Joern Rennecke  writes:
> @@ -1165,6 +1175,7 @@ shorten_branches (rtx first ATTRIBUTE_UN
>   get the current insn length.  If it has changed, reflect the change.
>   When nothing changes for a full pass, we are done.  */
>  
> +  bool first_pass ATTRIBUTE_UNUSED = true;
>while (something_changed)
>  {
>something_changed = 0;
> @@ -1220,6 +1231,7 @@ shorten_branches (rtx first ATTRIBUTE_UN
> rtx prev;
> int rel_align = 0;
> addr_diff_vec_flags flags;
> +   enum machine_mode vec_mode;
>  
> /* Avoid automatic aggregate initialization.  */
> flags = ADDR_DIFF_VEC_FLAGS (body);
> @@ -1298,9 +1310,12 @@ shorten_branches (rtx first ATTRIBUTE_UN
> else
>   max_addr += align_fuzz (max_lab, rel_lab, 0, 0);
>   }
> -   PUT_MODE (body, CASE_VECTOR_SHORTEN_MODE (min_addr - rel_addr,
> - max_addr - rel_addr,
> - body));
> +   vec_mode = CASE_VECTOR_SHORTEN_MODE (min_addr - rel_addr,
> +max_addr - rel_addr, body);
> +   if (first_pass
> +   || (GET_MODE_SIZE (vec_mode)
> +   >= GET_MODE_SIZE (GET_MODE (body))))
> + PUT_MODE (body, vec_mode);
> if (JUMP_TABLES_IN_TEXT_SECTION
> || readonly_data_section == text_section)
>   {

I think instead the set-up loop should have:

  if (GET_CODE (body) == ADDR_VEC || GET_CODE (body) == ADDR_DIFF_VEC)
{
#ifdef CASE_VECTOR_SHORTEN_MODE
  if (increasing && GET_CODE (body) == ADDR_DIFF_VEC)
PUT_MODE (body, CASE_VECTOR_SHORTEN_MODE (0, 0, body));
#endif
  /* This only takes room if read-only data goes into the text
 section.  */
  if (JUMP_TABLES_IN_TEXT_SECTION
  || readonly_data_section == text_section)
insn_lengths[uid] = (XVECLEN (body,
  GET_CODE (body) == ADDR_DIFF_VEC)
 * GET_MODE_SIZE (GET_MODE (body)));
  /* Alignment is handled by ADDR_VEC_ALIGN.  */
}

(with just the CASE_VECTOR_SHORTEN_MODE part being new).
We then start with the most optimistic length possible,
as with everything else.

The main shortening if statement should then be conditional on:

#ifdef CASE_VECTOR_SHORTEN_MODE
  if (increasing
  && JUMP_P (insn)
  && GET_CODE (PATTERN (insn)) == ADDR_DIFF_VEC)
...

(testing "increasing" rather than "optimize").  The code for changing
the mode should simply be:

  if (GET_MODE_SIZE (vec_mode)
      >= GET_MODE_SIZE (GET_MODE (body))))
PUT_MODE (body, vec_mode);

with first_pass no longer being necessary.

OK with that change, if you agree.

Richard


Re: Ping: RFA: add lock_length attribute to break branch-shortening cycles

2012-10-19 Thread Joern Rennecke

Quoting Richard Sandiford :


Joern Rennecke  writes:



When the condition is not fulfilled, we want to keep the length from the
previous iteration.


Right, that's what I mean.  So we need to make sure that the difference
between the address of the current instruction and the address of the
next instruction also doesn't change.  The code as posted was:

  else
{
  new_length = insn_current_length (insn);
  insn_current_address += new_length;
}

#ifdef ADJUST_INSN_LENGTH
  /* If needed, do any adjustment.  */
  tmp_length = new_length;
  ADJUST_INSN_LENGTH (insn, new_length);
  insn_current_address += (new_length - tmp_length);
#endif

  if (new_length != insn_lengths[uid]
  && (new_length > insn_lengths[uid] || first_pass))
{
  insn_lengths[uid] = new_length;
  something_changed = 1;
}

which always leaves insn_current_address based off new_length,
even if new_length is out of kilter with insn_lengths[uid].


Indeed.

I have attached a patch which I believe addresses the issues you raised.

Because making the length sticky is problematic when UIDs aren't unique,
I have tested this patch on top of that patch:

http://gcc.gnu.org/ml/gcc-patches/2012-10/msg01806.html

regression tested on i686-pc-linux-gnu X cris-elf and
i686-pc-linux-gnu X mipsel-elf.
2012-10-19  J"orn Rennecke  

* final.c (shorten_branches): When optimizing, start with small
length and increase from there, and don't decrease lengths.

Index: final.c
===================================================================
--- final.c (revision 192573)
+++ final.c (working copy)
@@ -1066,7 +1066,15 @@ shorten_branches (rtx first ATTRIBUTE_UN
 }
 #endif /* CASE_VECTOR_SHORTEN_MODE */
 
+  /* When optimizing, we start assuming minimum length, and keep increasing
+ lengths as we find the need for this, till nothing changes.
+ When not optimizing, we start assuming maximum lengths, and
+ do a single pass to update the lengths.  */
+  bool increasing = optimize != 0;
+
   /* Compute initial lengths, addresses, and varying flags for each insn.  */
+  int (*length_fun) (rtx) = increasing ? insn_min_length : insn_default_length;
+
   for (insn_current_address = 0, insn = first;
insn != 0;
insn_current_address += insn_lengths[uid], insn = NEXT_INSN (insn))
@@ -1117,6 +1125,8 @@ shorten_branches (rtx first ATTRIBUTE_UN
 #else
  const_delay_slots = 0;
 #endif
+ int (*inner_length_fun) (rtx)
+   = const_delay_slots ? length_fun : insn_default_length;
  /* Inside a delay slot sequence, we do not do any branch shortening
 if the shortening could change the number of delay slots
 of the branch.  */
@@ -1131,7 +1141,7 @@ shorten_branches (rtx first ATTRIBUTE_UN
inner_length = (asm_insn_count (PATTERN (inner_insn))
* insn_default_length (inner_insn));
  else
-   inner_length = insn_default_length (inner_insn);
+   inner_length = inner_length_fun (inner_insn);
 
  insn_lengths[inner_uid] = inner_length;
  if (const_delay_slots)
@@ -1149,7 +1159,7 @@ shorten_branches (rtx first ATTRIBUTE_UN
}
   else if (GET_CODE (body) != USE && GET_CODE (body) != CLOBBER)
{
- insn_lengths[uid] = insn_default_length (insn);
+ insn_lengths[uid] = length_fun (insn);
  varying_length[uid] = insn_variable_length_p (insn);
}
 
@@ -1165,6 +1175,7 @@ shorten_branches (rtx first ATTRIBUTE_UN
  get the current insn length.  If it has changed, reflect the change.
  When nothing changes for a full pass, we are done.  */
 
+  bool first_pass ATTRIBUTE_UNUSED = true;
   while (something_changed)
 {
   something_changed = 0;
@@ -1220,6 +1231,7 @@ shorten_branches (rtx first ATTRIBUTE_UN
  rtx prev;
  int rel_align = 0;
  addr_diff_vec_flags flags;
+ enum machine_mode vec_mode;
 
  /* Avoid automatic aggregate initialization.  */
  flags = ADDR_DIFF_VEC_FLAGS (body);
@@ -1298,9 +1310,12 @@ shorten_branches (rtx first ATTRIBUTE_UN
  else
max_addr += align_fuzz (max_lab, rel_lab, 0, 0);
}
- PUT_MODE (body, CASE_VECTOR_SHORTEN_MODE (min_addr - rel_addr,
-   max_addr - rel_addr,
-   body));
+ vec_mode = CASE_VECTOR_SHORTEN_MODE (min_addr - rel_addr,
+  max_addr - rel_addr, body);
+ if (first_pass
+ || (GET_MODE_SIZE (vec_mode)
+ >= GET_MODE_SIZE (GET_MODE (body))))
+   PUT_MODE (body, vec_mode);

Re: Ping: RFA: add lock_length attribute to break branch-shortening cycles

2012-10-18 Thread Richard Sandiford
Joern Rennecke  writes:
> Quoting Richard Sandiford :
>> The fact that we even have shared unique ids is pretty bad -- and surely
>> a contradiction in terms -- but I think both ways of handling them rely
>> on the length being the same for all copies.  If we don't record a length
>> (your version), we won't set something_changed if the length of this copy
>> did in fact change, which could lead to branches remaining too short.
>
> That is not the case, as the current length of the inner insn is added to
> new_length (of the outer insn).
>
>> If we do record a length (current version),
>
> In the current version, the length of the inner insn is calculated anew
> each iteration, so only the out-of-sequence copy suffers from the bogosity.
>
>> we could end up changing
>> the length of previous copies that have already been processed in
>> this iteration.
>
> Both that, and also using the length of the previous insn for the inner
> length calculation, and ratcheting them up together.
> In fact, this can make delay slot instructions ineligible for the delay slot
> they are in.  Also, the unwanted length cross-interference can violate
> alignment invariants, and this leads to invalid code on ARCompact.

But if you're saying that ARCompact wants different copies of the same insn
(with the same uid) to have different lengths, then please fix dbr_schedule
so that it uses different uids.  The "i == 0" thing is just too hackish IMO.

>> is simpler and means we get the natural behaviour if anyone ever does
>> "fix" dbr_schedule.
>>
>> I think first_pass should be !optimize.
>
> That would not work for CASE_VECTOR_SHORTEN_MODE, where it controls setting
> the mode of BODY.
>
> And as we have to use the first_pass variable there, I thought it's simpler
> to also use it in these other places; if we ever want to switch the
> decision on when to use increasing and when to use decreasing lengths,
> the algorithm continues to work this way, without the need to replace the
> optimize test in all places with the new test.

I don't mind having a local boolean to control whether we're applying
increasing or decreasing lengths, if you prefer that to plain "optimize".
I just don't like the condition changing from the second approximation
to the third.

>>> @@ -1382,7 +1399,8 @@ shorten_branches (rtx first ATTRIBUTE_UN
>>>   insn_current_address += (new_length - tmp_length);
>>>  #endif
>>>
>>> - if (new_length != insn_lengths[uid])
>>> + if (new_length != insn_lengths[uid]
>>> + && (new_length > insn_lengths[uid] || first_pass))
>>> {
>>>   insn_lengths[uid] = new_length;
>>>   something_changed = 1;
>>
>> Same first_pass comment here.  I.e.:
>>
>>if (optimize
>>? new_length > insn_lengths[uid]
>>: new_length < insn_lengths[uid])
>
> It's about testing a condition first that can eliminate further tests:
> if the length is unchanged - which would be expected to be the case
> for most variable length instructions after the first iteration -
> we don't need to test the other conditions.
> And optimize, being a global variable, is relatively expensive on
> some host platforms.
> It could get even more expensive if/when we ever go multithreaded.
>
>> I think you need an "else" that does:
>>
>> insn_current_address += insn_lengths[uid] - new_length;
>
> When the condition is not fulfilled, we want to keep the length from the
> previous iteration.

Right, that's what I mean.  So we need to make sure that the difference
between the address of the current instruction and the address of the
next instruction also doesn't change.  The code as posted was:

  else
{
  new_length = insn_current_length (insn);
  insn_current_address += new_length;
}

#ifdef ADJUST_INSN_LENGTH
  /* If needed, do any adjustment.  */
  tmp_length = new_length;
  ADJUST_INSN_LENGTH (insn, new_length);
  insn_current_address += (new_length - tmp_length);
#endif

  if (new_length != insn_lengths[uid]
  && (new_length > insn_lengths[uid] || first_pass))
{
  insn_lengths[uid] = new_length;
  something_changed = 1;
}

which always leaves insn_current_address based off new_length,
even if new_length is out of kilter with insn_lengths[uid].
It looked like it should be:

  else
{
  new_length = insn_current_length (insn);
  insn_current_address += new_length;
}

#ifdef ADJUST_INSN_LENGTH
  /* If needed, do any adjustment.  */
  tmp_length = new_length;
  ADJUST_INSN_LENGTH (insn, new_length);
  insn_current_address += (new_length - tmp_length);
#endif

  if (...honour new_length...)
{
  insn_lengths[uid] = new_length;
  something_changed = 1;
}
  else
    insn_current_address += insn_lengths[uid] - new_length;

Re: Ping: RFA: add lock_length attribute to break branch-shortening cycles

2012-10-18 Thread Joern Rennecke

Quoting Richard Sandiford :


The fact that we even have shared unique ids is pretty bad -- and surely
a contradiction in terms -- but I think both ways of handling them rely
on the length being the same for all copies.  If we don't record a length
(your version), we won't set something_changed if the length of this copy
did in fact change, which could lead to branches remaining too short.


That is not the case, as the current length of the inner insn is added to
new_length (of the outer insn).


If we do record a length (current version),


In the current version, the length of the inner insn is calculated anew
each iteration, so only the out-of-sequence copy suffers from the bogosity.


we could end up changing
the length of previous copies that have already been processed in
this iteration.


Both that, and also using the length of the previous insn for the inner
length calculation, and ratcheting them up together.
In fact, this can make delay slot instructions ineligible for the delay slot
they are in.  Also, the unwanted length cross-interference can violate
alignment invariants, and this leads to invalid code on ARCompact.

  Both would be wrong, but at least the current version

is simpler and means we get the natural behaviour if anyone ever does
"fix" dbr_schedule.

I think first_pass should be !optimize.


That would not work for CASE_VECTOR_SHORTEN_MODE, where it controls setting
the mode of BODY.

And as we have to use the first_pass variable there, I thought it's simpler
to also use it in these other places; if we ever want to switch the
decision on when to use increasing and when to use decreasing lengths,
the algorithm continues to work this way, without the need to replace the
optimize test in all places with the new test.


@@ -1382,7 +1399,8 @@ shorten_branches (rtx first ATTRIBUTE_UN
  insn_current_address += (new_length - tmp_length);
 #endif

- if (new_length != insn_lengths[uid])
+ if (new_length != insn_lengths[uid]
+ && (new_length > insn_lengths[uid] || first_pass))
{
  insn_lengths[uid] = new_length;
  something_changed = 1;


Same first_pass comment here.  I.e.:

  if (optimize
  ? new_length > insn_lengths[uid]
  : new_length < insn_lengths[uid])


It's about testing a condition first that can eliminate further tests:
if the length is unchanged - which would be expected to be the case
for most variable length instructions after the first iteration -
we don't need to test the other conditions.
And optimize, being a global variable, is relatively expensive on
some host platforms.
It could get even more expensive if/when we ever go multithreaded.


I think you need an "else" that does:

insn_current_address += insn_lengths[uid] - new_length;


When the condition is not fulfilled, we want to keep the length from the
previous iteration.  We could put
"new_length = insn_length[uid];" in the else clause, but that'd be pointless,
as new_length is dead at that point.


Re: Ping: RFA: add lock_length attribute to break branch-shortening cycles

2012-10-18 Thread Richard Sandiford
Joern Rennecke  writes:
> @@ -1360,12 +1369,20 @@ shorten_branches (rtx first ATTRIBUTE_UN
> else
>   inner_length = insn_current_length (inner_insn);
>  
> -   if (inner_length != insn_lengths[inner_uid])
> +   /* We can't record lengths of delay slot insns, because
> +  they might just be a copy of the insn at the branch
> +  target, sharing its uid.  */
> +   if (i == 0 && inner_length != insn_lengths[inner_uid])
>   {
> -   insn_lengths[inner_uid] = inner_length;
> -   something_changed = 1;
> +   if (inner_length > insn_lengths[inner_uid] || first_pass)
> + {
> +   insn_lengths[inner_uid] = inner_length;
> +   something_changed = 1;
> + }
> +   else
> + inner_length = insn_lengths[inner_uid];
>   }
> -   insn_current_address += insn_lengths[inner_uid];
> +   insn_current_address += inner_length;
> new_length += inner_length;
>   }
>   }

The fact that we even have shared unique ids is pretty bad -- and surely
a contradiction in terms -- but I think both ways of handling them rely
on the length being the same for all copies.  If we don't record a length
(your version), we won't set something_changed if the length of this copy
did in fact change, which could lead to branches remaining too short.
If we do record a length (current version), we could end up changing
the length of previous copies that have already been processed in
this iteration.  Both would be wrong, but at least the current version
is simpler and means we get the natural behaviour if anyone ever does
"fix" dbr_schedule.

I think first_pass should be !optimize.  The set-up loop chooses minimum
lengths when optimising, so it should be correct to keep the maximum on
all iterations of this loop.  I.e.

  if (optimize
  ? inner_length > insn_lengths[inner_uid]
  : inner_length < insn_lengths[inner_uid])
{
  insn_lengths[inner_uid] = inner_length;
  something_changed = 1;
}
  else
inner_length = insn_lengths[inner_uid];

> @@ -1382,7 +1399,8 @@ shorten_branches (rtx first ATTRIBUTE_UN
> insn_current_address += (new_length - tmp_length);
>  #endif
>  
> -   if (new_length != insn_lengths[uid])
> +   if (new_length != insn_lengths[uid]
> +   && (new_length > insn_lengths[uid] || first_pass))
>   {
> insn_lengths[uid] = new_length;
> something_changed = 1;

Same first_pass comment here.  I.e.:

  if (optimize
  ? new_length > insn_lengths[uid]
  : new_length < insn_lengths[uid])

I think you need an "else" that does:

insn_current_address += insn_lengths[uid] - new_length;

Richard


Re: Ping: RFA: add lock_length attribute to break branch-shortening cycles

2012-10-17 Thread Richard Biener
On Tue, Oct 16, 2012 at 9:35 PM, Joern Rennecke
 wrote:
> Quoting Richard Sandiford :
>
>> Joern Rennecke  writes:
>>>
>>> 2012-10-04  Joern Rennecke  
>>>
>>>  * final.c (get_attr_length_1): Use direct recursion rather than
>>>  calling get_attr_length.
>>>  (get_attr_lock_length): New function.
>>>  (INSN_VARIABLE_LENGTH_P): Define.
>>>  (shorten_branches): Take HAVE_ATTR_lock_length into account.
>>>  Don't overwrite non-delay slot insn lengths with the lengths of
>>>  delay slot insns with same uid.
>>>  * genattrtab.c (lock_length_str): New variable.
>>>  (make_length_attrs): New parameter base.
>>>  (main): Initialize lock_length_str.
>>>  Generate lock_lengths attributes.
>>>  * genattr.c (gen_attr): Emit declarations for lock_length
>>> attribute
>>> related functions.
>>> * doc/md.texi (node Insn Lengths): Document lock_length
>>> attribute.
>>>
>>> http://gcc.gnu.org/ml/gcc-patches/2012-10/msg00383.html
>>
>>
>> Sorry, this is really just repeating Richard B's comments, but I still
>> don't understand why we need two attributes.  Why can't shorten_branches
>> work with the existing length and simply make sure that the length doesn't
>> decrease from one iteration to the next?  That seems to be how you
>> implement
>> CASE_VECTOR_SHORTEN_MODE.  It also means that we can continue to use the
>> pessimistic algorithm for -O0.
>
>
> That does not really work for ARCompact, where non-branch instructions are
> lengthened to align or un-align branch instructions.  Failure to
> correct the size of such an instruction downwards when indicated messes
> up not only the directly affected branch, but also gets the alignment
> wrong downstream, and can even cause branches to go out of range
> (blindsiding branch shortening) when there are interactions with explicit
> alignments for things like loops.
> Unless you think it's OK to poke the contents of the insn_length array from
> ADJUST_INSN_LENGTH.  That should work, but it'd require an extra parameter
> when you want to hookize this macro.
>
>
>> You said in your reply that one of the reasons was to avoid
>> "interesting" interactions with ADJUST_INSN_LENGTH.  But the
>> previous minimum length would be applied after ADJUST_INSN_LENGTH,
>> so I'm not sure why it's a factor.
>
>
> Well, for starters, the ARCompact ADJUST_INSN_LENGTH uses
> get_attr_lock_length when processing delayed-branch SEQUENCEs.
> And then there's the short/long instruction opcode distinction (function /
> attribute verify_short), which depends on alignment, and influences
> instruction length and conditional execution calculation.
> Without exact alignment information, branch scheduling becomes a crapshoot.
>
> You might think I could subsume the length effect of the upsized
> instructions
> in the affected branch, but that doesn't work because the exact length is
> needed both for ADJUST_INSN_LENGTH, and in order to output the correct
> branch / jump output template.
>
>
>
>> If lock_length is just an optimisation on top of that, then maybe
>> it would help to split it out.
>
>
> Well, to the extent that branch shortening and scheduling are just
> optimizations, having the lock_length attribute is just an optimization.
>
> Well, we could split it anyway, and give ports without the need for
> multiple length attributes the benefit of the optimistic algorithm.
>
> I have attached a patch that implements this.

Looks reasonable to me, though I'm not familiar enough with the code
to approve it.

I'd strongly suggest to try harder to make things work for you without
the new attribute even though I wasn't really able to follow your reasoning
on why that wouldn't work.  It may be easier to motivate this change
once the port is in without that attribute so one can actually look at
the machine description and port details.

Richard.

> I first meant to test it for sh-elf, alas, PR54938 currently makes this
> impossible.
> So I used arm-eabi instead.  Regression tests look OK, but libgcc.a and
> libc.a (newlib) are unchanged in size, while libstdc++.a increases by twelve
> bytes; more specifically, the size increase happens for the object file
> debug.o, which goes from 11028 to 11040 bytes.
> Likewise, cris-elf and mipsel-elf show a small increase in size of debug.o,
> with the rest of libstdc++.a unchanged in size.
>
> Generating and comparing the arm intermediate files debug.s shows a
> suspicious incidence of pathnames.  Using a longer build directory
> pathname makes no
> difference, though.
> OTOH, using a longer source directory pathname does.  So that leaves...
> makes no difference for building these libraries.
> It should stop endless looping in shorten_branches, though.  Albeit that
> is only relevant for ports that do have some negative shorten_branch
> feedback.


Re: Ping: RFA: add lock_length attribute to break branch-shortening cycles

2012-10-16 Thread Joern Rennecke

Quoting Richard Sandiford :


Joern Rennecke  writes:

2012-10-04  Joern Rennecke  

 * final.c (get_attr_length_1): Use direct recursion rather than
 calling get_attr_length.
 (get_attr_lock_length): New function.
 (INSN_VARIABLE_LENGTH_P): Define.
 (shorten_branches): Take HAVE_ATTR_lock_length into account.
 Don't overwrite non-delay slot insn lengths with the lengths of
 delay slot insns with same uid.
 * genattrtab.c (lock_length_str): New variable.
 (make_length_attrs): New parameter base.
 (main): Initialize lock_length_str.
 Generate lock_lengths attributes.
 * genattr.c (gen_attr): Emit declarations for lock_length attribute
related functions.
* doc/md.texi (node Insn Lengths): Document lock_length attribute.

http://gcc.gnu.org/ml/gcc-patches/2012-10/msg00383.html


Sorry, this is really just repeating Richard B's comments, but I still
don't understand why we need two attributes.  Why can't shorten_branches
work with the existing length and simply make sure that the length doesn't
decrease from one iteration to the next?  That seems to be how you implement
CASE_VECTOR_SHORTEN_MODE.  It also means that we can continue to use the
pessimistic algorithm for -O0.


That does not really work for ARCompact, where non-branch instructions are
lengthened to align or un-align branch instructions.  Failure to
correct the size of such an instruction downwards when indicated messes
up not only the directly affected branch, but also gets the alignment
wrong downstream, and can even cause branches to go out of range
(blindsiding branch shortening) when there are interactions with
explicit alignments for things like loops.
Unless you think it's OK to poke the contents of the insn_length array from
ADJUST_INSN_LENGTH.  That should work, but it'd require an extra parameter
when you want to hookize this macro.


You said in your reply that one of the reasons was to avoid
"interesting" interactions with ADJUST_INSN_LENGTH.  But the
previous minimum length would be applied after ADJUST_INSN_LENGTH,
so I'm not sure why it's a factor.


Well, for starters, the ARCompact ADJUST_INSN_LENGTH uses
get_attr_lock_length when processing delayed-branch SEQUENCEs.
And then there's the short/long instruction opcode distinction (function /
attribute verify_short), which depends on alignment, and influences
instruction length and conditional execution calculation.
Without exact alignment information, branch scheduling becomes a crapshoot.

You might think I could subsume the length effect of the upsized instructions
in the affected branch, but that doesn't work because the exact length is
needed both for ADJUST_INSN_LENGTH, and in order to output the correct
branch / jump output template.



If lock_length is just an optimisation on top of that, then maybe
it would help to split it out.


Well, to the extent that branch shortening and scheduling are just
optimizations, having the lock_length attribute is just an optimization.

Well, we could split it anyway, and give ports without the need for
multiple length attributes the benefit of the optimistic algorithm.

I have attached a patch that implements this.

I first meant to test it for sh-elf, alas, PR54938 currently makes this
impossible.
So I used arm-eabi instead.  Regression tests look OK, but libgcc.a and
libc.a (newlib) are unchanged in size, while libstdc++.a increases by twelve
bytes; more specifically, the size increase happens for the object file
debug.o, which goes from 11028 to 11040 bytes.
Likewise, cris-elf and mipsel-elf show a small increase in size of debug.o,
with the rest of libstdc++.a unchanged in size.

Generating and comparing the arm intermediate files debug.s shows a suspicious
incidence of pathnames.  Using a longer build directory pathname makes no
difference, though.
OTOH, using a longer source directory pathname does.  So that leaves...
makes no difference for building these libraries.
It should stop endless looping in shorten_branches, though.  Albeit that
is only relevant for ports that do have some negative shorten_branch feedback.
2012-10-16  J"orn Rennecke  

* final.c (shorten_branches): When optimizing, start with small
length and increase from there, and don't decrease lengths.
Don't overwrite non-delay slot insn lengths with the lengths of
delay slot insns with same uid.

Index: final.c
===================================================================
--- final.c (revision 192491)
+++ final.c (working copy)
@@ -1067,6 +1067,8 @@ shorten_branches (rtx first ATTRIBUTE_UN
 #endif /* CASE_VECTOR_SHORTEN_MODE */
 
   /* Compute initial lengths, addresses, and varying flags for each insn.  */
+  int (*length_fun) (rtx) = optimize ? insn_min_length : insn_default_length;
+
   for (insn_current_address = 0, insn = first;
insn != 0;
insn_current_address += insn_lengths[uid], insn = NEXT_INSN (insn))

Re: Ping: RFA: add lock_length attribute to break branch-shortening cycles

2012-10-15 Thread Richard Sandiford
Joern Rennecke  writes:
> 2012-10-04  Joern Rennecke  
>
>  * final.c (get_attr_length_1): Use direct recursion rather than
>  calling get_attr_length.
>  (get_attr_lock_length): New function.
>  (INSN_VARIABLE_LENGTH_P): Define.
>  (shorten_branches): Take HAVE_ATTR_lock_length into account.
>  Don't overwrite non-delay slot insn lengths with the lengths of
>  delay slot insns with same uid.
>  * genattrtab.c (lock_length_str): New variable.
>  (make_length_attrs): New parameter base.
>  (main): Initialize lock_length_str.
>  Generate lock_lengths attributes.
>  * genattr.c (gen_attr): Emit declarations for lock_length attribute
>   related functions.
>   * doc/md.texi (node Insn Lengths): Document lock_length attribute.
>
> http://gcc.gnu.org/ml/gcc-patches/2012-10/msg00383.html

Sorry, this is really just repeating Richard B's comments, but I still
don't understand why we need two attributes.  Why can't shorten_branches
work with the existing length and simply make sure that the length doesn't
decrease from one iteration to the next?  That seems to be how you implement
CASE_VECTOR_SHORTEN_MODE.  It also means that we can continue to use the
pessimistic algorithm for -O0.

You said in your reply that one of the reasons was to avoid
"interesting" interactions with ADJUST_INSN_LENGTH.  But the
previous minimum length would be applied after ADJUST_INSN_LENGTH,
so I'm not sure why it's a factor.

If lock_length is just an optimisation on top of that, then maybe
it would help to split it out.

Richard


Ping: RFA: add lock_length attribute to break branch-shortening cycles

2012-10-14 Thread Joern Rennecke

2012-10-04  Joern Rennecke  

* final.c (get_attr_length_1): Use direct recursion rather than
calling get_attr_length.
(get_attr_lock_length): New function.
(INSN_VARIABLE_LENGTH_P): Define.
(shorten_branches): Take HAVE_ATTR_lock_length into account.
Don't overwrite non-delay slot insn lengths with the lengths of
delay slot insns with same uid.
* genattrtab.c (lock_length_str): New variable.
(make_length_attrs): New parameter base.
(main): Initialize lock_length_str.
Generate lock_lengths attributes.
* genattr.c (gen_attr): Emit declarations for lock_length attribute
related functions.
* doc/md.texi (node Insn Lengths): Document lock_length attribute.

http://gcc.gnu.org/ml/gcc-patches/2012-10/msg00383.html


Re: RFA: add lock_length attribute to break branch-shortening cycles

2012-10-04 Thread Joern Rennecke

Quoting Richard Guenther :


I miss a few things in this description:
- what is the value of lock_length supposed to be?  From the "lock"
  prefix it sounds like it is something unchanging, maybe even constant,
  thus a maximum?
- the length attribute still needs to be specified when lock_length is?
  how do they relate?  Is lock_length always smaller / bigger than length?


I hope I have clarified this in the updated documentation:

 Usually, branch shortening is done assuming the worst case (i.e.
longest) lengths, and then iterating (if optimizing) to smaller lengths
till no further changes occur.  This does not work so well for
architectures that have very small minimum offsets and considerable
jumps in instruction lengths.

 If you define the `lock_length' attribute, branch shortening will work
the other way round: it starts out assuming minimum instruction lengths
and iterates from there.  `lock_length' specifies an instruction length
value that is calculated like `length' in every iteration, but if the
value at the last iteration was larger, that larger previous value will
be used instead.  The value computed for the `length' attribute will be
no smaller than that of the `lock_length' attribute, but you may still
specify a larger value, and in that case `length' can decrease in the
next iteration, but not below the value of `lock_length'.  The easiest
way to make sure branch shortening doesn't start looping indefinitely
is to define the `length' attribute to only a minimum instruction
length for varying length instructions and let the `lock_length'
attribute do all the heavy lifting of computing varying lengths.  On
the other hand, for some instruction sets you might get better
shortening outcomes if you use a more complex `length' attribute and
use `lock_length' only to the extent required to prevent indefinite
looping.  Note that `ADJUST_INSN_LENGTH' applies only to the `length'
attribute.

 Because defining `lock_length' makes branch shortening start with
optimistic assumptions, this means we have to see it through to the end
to eliminate all out-of-range branches, thus branch shortening might
take a bit longer at `-O0' than if you didn't define the `lock_length'
attribute.

 Using `lock_length' with varying delay slots and varying length delay
slot insns (all at once) is currently not supported.
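
To make the motivating problem concrete (very small minimum offsets combined
with big jumps in instruction length), a toy length function for such a
branch could look like this; the numbers are invented, not actual ARC
encodings:

/* Toy example: a 2-byte branch with a +-64 byte reach vs. an 8-byte
   replacement sequence.  Starting from worst-case lengths inflates the
   estimated offsets, so the range test fails and the long form gets
   chosen even where the converged layout would be in range.  */
static int
branch_length (int insn_addr, int target_addr)
{
  int offset = target_addr - insn_addr;

  if (offset >= -64 && offset < 64)
    return 2;   /* short compare-and-branch form */
  return 8;     /* compare plus long branch sequence */
}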


- what happens if you have patterns with lock_length and patterns without?
- what patterns does lock_length apply to?


These questions don't really make literal sense; instruction attributes
are defined for all instructions.  Of course, you can define the default
value to 0, which means you see no effect of this attribute on a pattern
unless some non-default definition applies.


In general optimistically attacking this kind of problem should be always
better - did you try simply switching this for all targets?


No, I haven't.  The boundaries to be set for branch offsets can be different
when starting with optimistic assumptions than when starting with
pessimistic assumptions.  Moreover, there could be 'interesting'
interactions with ADJUST_INSN_LENGTH.  And there is the problem with
handling varying
delay slots.  I have modified the final.c patch to reduce the scope of this
issue so that lock_length should hopefully be useful for a wider range of
targets.

Also, ...


 It shouldn't be
slower and the only thing you need to guarantee is that during iteration
you never make insn-lengths smaller again.


At -O0 it is slower because we can't finish the loop early.
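
The difference between the two regimes is just the direction in which
lengths may move across iterations; in a toy sketch (compute_length is a
made-up stand-in for evaluating the length attributes plus
ADJUST_INSN_LENGTH):

/* Stand-in declaration; hypothetical.  */
extern int compute_length (int i, const int *lengths);

/* Toy contrast of pessimistic vs. optimistic branch shortening:
   monotonicity in the chosen direction is what guarantees termination.  */
static void
iterate_lengths (int n_insns, int *lengths, int optimistic)
{
  int changed = 1;

  while (changed)
    {
      changed = 0;
      for (int i = 0; i < n_insns; i++)
        {
          int new_len = compute_length (i, lengths);

          if (optimistic ? new_len > lengths[i] : new_len < lengths[i])
            {
              lengths[i] = new_len;
              changed = 1;
            }
        }
      /* With a pessimistic start every intermediate state is already
         conservatively correct, so the loop may be cut short; an
         optimistic start must run to the fixpoint to eliminate
         out-of-range branches.  */
    }
}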

For all these reasons, I think it is better if each target maintainer
evaluates whether the better branch shortening outweighs the longer -O0
compilation time, and addresses any issues arising if/when converting.

Bootstrapped on i686-pc-linux-gnu
Index: doc/md.texi
===================================================================
--- doc/md.texi (revision 192036)
+++ doc/md.texi (working copy)
@@ -8004,6 +8004,42 @@ (define_insn "jump"
   (const_int 6)))])
 @end smallexample
 
+@cindex lock_length
+Usually, branch shortening is done assuming the worst case (i.e. longest)
+lengths, and then iterating (if optimizing) to smaller lengths till 
+no further changes occur.  This does not work so well for architectures
+that have very small minimum offsets and considerable jumps in instruction
+lengths.
+
+If you define the @code{lock_length} attribute, branch shortening will
+work the other way round: it starts out assuming minimum instruction
+lengths and iterates from there.  @code{lock_length} specifies an instruction
+length value that is calculated like @code{length} in every iteration,
+but if the value at the last iteration was larger, that larger previous
+value will be used instead.
+The value computed for the @code{length} attribute will be no smaller
+than that of the @code{lock_length} attribute, but you may still specify
+a larger value, and in that case @code{length} can decrease in the next
+iteration, but not below the value of @code{lock_length}.

Re: RFA: add lock_length attribute to break branch-shortening cycles

2012-10-04 Thread Richard Guenther
On Wed, Oct 3, 2012 at 8:22 PM, Joern Rennecke
 wrote:
> The ARCompact architecture has some pipelining features that result in
> the vanilla branch shortening not always converging.
>
> Moreover, there are some short, complex branch instructions with very short
> offsets; replacing them with a multi-insn sequence when the offset doesn't
> reach makes the code significantly longer.  Thus, when starting branch
> shortening with pessimistic assumptions, the short branches are often not
> used because of the pessimistic branch length causing the offsets to go out
> of range.
> This problem can be avoided when starting with a low estimate and working
> upwards.  However, that makes the incidence of infinite branch shortening
> cycles higher, and also makes it impossible to just break out after some
> iteration count.
>
> To address these issues, I've made the generator programs recognize the
> optional lock_length attribute.
>
> To quote from the documentation added for this feature:
>
>  If you define the `lock_length' attribute, branch shortening will work
> the other way round: it starts out assuming minimum instruction lengths
> and iterates from there.  In addition, the value of the `lock_length'
> attribute does not decrease across iterations, and the value computed
> for the `length' attribute will be no smaller than that of the
> `lock_length' attribute.

I miss a few things in this description:
- what is the value of lock_length supposed to be?  From the "lock"
  prefix it sounds like it is something unchanging, maybe even constant,
  thus a maximum?
- the length attribute still needs to be specified when lock_length is?
  how do they relate?  Is lock_length always smaller / bigger than length?
- what happens if you have patterns with lock_length and patterns without?
- what patterns does lock_length apply to?

In general optimistically attacking this kind of problem should always be
better - did you try simply switching this for all targets?  It shouldn't be
slower and the only thing you need to guarantee is that during iteration
you never make insn-lengths smaller again.

Richard.

> bootstrapped and regression tested on i686-pc-linux-gnu
>
> 2012-10-03  Joern Rennecke  
>
> * final.c (get_attr_length_1): Use direct recursion rather than
> calling get_attr_length.
> (get_attr_lock_length): New function.
> (INSN_VARIABLE_LENGTH_P): Define.
> (shorten_branches): Take HAVE_ATTR_lock_length into account.
> Don't overwrite non-delay slot insn lengths with the lengths of
> delay slot insns with same uid.
> * genattrtab.c (lock_length_str): New variable.
> (make_length_attrs): New parameter base.
> (main): Initialize lock_length_str.
> Generate lock_lengths attributes.
> * genattr.c (gen_attr): Emit declarations for lock_length attribute
> related functions.
> * doc/md.texi (node Insn Lengths): Document lock_length attribute.
>
> Index: doc/md.texi
> ===
> --- doc/md.texi (revision 192036)
> +++ doc/md.texi (working copy)
> @@ -8004,6 +8004,20 @@ (define_insn "jump"
>(const_int 6)))])
>  @end smallexample
>
> +@cindex lock_length
> +Usually, branch shortening is done assuming the worst case (i.e. longest)
> +lengths, and then iterating (if optimizing) to smaller lengths until
> +no further changes occur.  This does not work so well for architectures
> +that have very small minimum offsets and considerable jumps in instruction
> +lengths.
> +
> +If you define the @code{lock_length} attribute, branch shortening will
> +work the other way round: it starts out assuming minimum instruction
> +lengths and iterates from there.  In addition, the value of the
> +@code{lock_length} attribute does not decrease across iterations, and
> +the value computed for the @code{length} attribute will be no smaller
> +than that of the @code{lock_length} attribute.
> +
>  @end ifset
>  @ifset INTERNALS
>  @node Constant Attributes
> Index: final.c
> ===
> --- final.c (revision 192036)
> +++ final.c (working copy)
> @@ -312,6 +312,7 @@ dbr_sequence_length (void)
> `insn_current_length'.  */
>
>  static int *insn_lengths;
> +static char *uid_lock_length;
>
>  VEC(int,heap) *insn_addresses_;
>
> @@ -447,6 +448,20 @@ get_attr_length (rtx insn)
>return get_attr_length_1 (insn, insn_default_length);
>  }
>
> +#ifdef HAVE_ATTR_lock_length
> +int
> +get_attr_lock_length (rtx insn)
> +{
> +  if (uid_lock_length && insn_lengths_max_uid > INSN_UID (insn))
> +    return uid_lock_length[INSN_UID (insn)];
> +  return get_attr_length_1 (insn, insn_min_lock_length);
> +}
> +#define INSN_VARIABLE_LENGTH_P(INSN) \
> +  (insn_variable_length_p (INSN) || insn_variable_lock_length_p (INSN))
> +#else
> +#define INSN_VARIABLE_LENGTH_P(INSN) (insn_variable_length_p (INSN))
> +#e

RFA: add lock_length attribute to break branch-shortening cycles

2012-10-03 Thread Joern Rennecke

The ARCompact architecture has some pipelining features that result in
the vanilla branch shortening not always converging.

Moreover, there are some short, complex branch instructions with very short
offsets; replacing them with a multi-insn sequence when the offset doesn't
reach makes the code significantly longer.  Thus, when starting branch
shortening with pessimistic assumptions, the short branches are often not
used because of the pessimistic branch length causing the offsets to go
out of range.
This problem can be avoided when starting with a low estimate and working
upwards.  However, that makes the incidence of infinite branch shortening
cycles higher, and also makes it impossible to just break out after some
iteration count.

To address these issues, I've made the generator programs recognize the
optional lock_length attribute.

To quote from the documentation added for this feature:

 If you define the `lock_length' attribute, branch shortening will work
the other way round: it starts out assuming minimum instruction lengths
and iterates from there.  In addition, the value of the `lock_length'
attribute does not decrease across iterations, and the value computed
for the `length' attribute will be no smaller than that of the
`lock_length' attribute.

bootstrapped and regression tested on i686-pc-linux-gnu
2012-10-03  Joern Rennecke  

* final.c (get_attr_length_1): Use direct recursion rather than
calling get_attr_length.
(get_attr_lock_length): New function.
(INSN_VARIABLE_LENGTH_P): Define.
(shorten_branches): Take HAVE_ATTR_lock_length into account.
Don't overwrite non-delay slot insn lengths with the lengths of
delay slot insns with same uid.
* genattrtab.c (lock_length_str): New variable.
(make_length_attrs): New parameter base.
(main): Initialize lock_length_str.
Generate lock_lengths attributes.
* genattr.c (gen_attr): Emit declarations for lock_length attribute
related functions.
* doc/md.texi (node Insn Lengths): Document lock_length attribute.

Index: doc/md.texi
===
--- doc/md.texi (revision 192036)
+++ doc/md.texi (working copy)
@@ -8004,6 +8004,20 @@ (define_insn "jump"
   (const_int 6)))])
 @end smallexample
 
+@cindex lock_length
+Usually, branch shortening is done assuming the worst case (i.e. longest)
+lengths, and then iterating (if optimizing) to smaller lengths until
+no further changes occur.  This does not work so well for architectures
+that have very small minimum offsets and considerable jumps in instruction
+lengths.
+
+If you define the @code{lock_length} attribute, branch shortening will
+work the other way round: it starts out assuming minimum instruction
+lengths and iterates from there.  In addition, the value of the
+@code{lock_length} attribute does not decrease across iterations, and
+the value computed for the @code{length} attribute will be no smaller
+than that of the @code{lock_length} attribute.
+
 @end ifset
 @ifset INTERNALS
 @node Constant Attributes
Index: final.c
===
--- final.c (revision 192036)
+++ final.c (working copy)
@@ -312,6 +312,7 @@ dbr_sequence_length (void)
`insn_current_length'.  */
 
 static int *insn_lengths;
+static char *uid_lock_length;
 
 VEC(int,heap) *insn_addresses_;
 
@@ -447,6 +448,20 @@ get_attr_length (rtx insn)
   return get_attr_length_1 (insn, insn_default_length);
 }
 
+#ifdef HAVE_ATTR_lock_length
+int
+get_attr_lock_length (rtx insn)
+{
+  if (uid_lock_length && insn_lengths_max_uid > INSN_UID (insn))
+    return uid_lock_length[INSN_UID (insn)];
+  return get_attr_length_1 (insn, insn_min_lock_length);
+}
+#define INSN_VARIABLE_LENGTH_P(INSN) \
+  (insn_variable_length_p (INSN) || insn_variable_lock_length_p (INSN))
+#else
+#define INSN_VARIABLE_LENGTH_P(INSN) (insn_variable_length_p (INSN))
+#endif
+
 /* Obtain the current length of an insn.  If branch shortening has been done,
get its actual length.  Otherwise, get its minimum length.  */
 int
@@ -865,6 +880,7 @@ shorten_branches (rtx first ATTRIBUTE_UN
   rtx body;
   int uid;
   rtx align_tab[MAX_CODE_ALIGN];
+  int (*length_fun) (rtx);
 
 #endif
 
@@ -875,6 +891,10 @@ shorten_branches (rtx first ATTRIBUTE_UN
   free (uid_shuid);
 
   uid_shuid = XNEWVEC (int, max_uid);
+#ifdef HAVE_ATTR_lock_length
+  uid_lock_length = XNEWVEC (char, max_uid);
+  memset (uid_lock_length, 0, max_uid);
+#endif
 
   if (max_labelno != max_label_num ())
 {
@@ -1067,6 +1087,10 @@ shorten_branches (rtx first ATTRIBUTE_UN
 #endif /* CASE_VECTOR_SHORTEN_MODE */
 
   /* Compute initial lengths, addresses, and varying flags for each insn.  */
+  length_fun = insn_default_length;
+#ifdef HAVE_ATTR_lock_length
+  length_fun = insn_min_length;
+#endif
   for (insn_current_address = 0, insn = first;
insn != 0;