Re: [Patch,AVR]: PR49687 (better widening 32-bit mul)

Denis Chertykov Sun, 31 Jul 2011 07:42:09 -0700

2011/7/28 Georg-Johann Lay <a...@gjlay.de>:
> Richard Henderson wrote:
>> On 07/27/2011 06:21 AM, Georg-Johann Lay wrote:
>>> +(define_insn_and_split "*mulsi3"
>>> +  [(set (match_operand:SI 0 "pseudo_register_operand"                      
>>> "=r")
>>> +        (mult:SI (match_operand:SI 1 "pseudo_register_operand"             
>>>  "r")
>>> +                 (match_operand:SI 2 
>>> "pseudo_register_or_const_int_operand" "rn")))
>>> +   (clobber (reg:DI 18))]
>>> +  "AVR_HAVE_MUL && !reload_completed"
>>> +  { gcc_unreachable(); }
>>> +  "&& 1"
>>> +  [(set (reg:SI 18)
>>> +        (match_dup 1))
>>
>> That seems like it's guaranteed to force an unnecessary move.
>> Have you tried defining special-purpose register classes to
>> force reload to move the data into the right hard regs?
>>
>> E.g.  "Y" prefix
>>       "QHS" size
>>       two digit starting register number, as needed.
>>
>> You'll probably end up with quite a few register classes
>> out of this, but hopefully reload can do a better job than
>> you can manually...
>>
>>
>> r~
>
> Waaaaaahh, I introduced register classes and constraints to tell register 
> allocator
> what's the intention if the insns, the ... parts just dealing with CONST_INTs
> and not needed in the remainder:
>
> (define_expand "mulsi3"
>  [(parallel [(set (match_operand:SI 0 "register_operand" "")
>                   (mult:SI (match_operand:SI 1 "register_operand" "")
>                            (match_operand:SI 2 "nonmemory_operand" "")))
>              (clobber (reg:HI 26))])]
>  "AVR_HAVE_MUL"
>  {
>     ...
>  })
>
> (define_insn_and_split "*mulsi3"
>  [(set (match_operand:SI 0 "register_operand"           "=RS22")
>        (mult:SI (match_operand:SI 1 "register_operand"  "%RS22")
>                 (match_operand:SI 2 "nonmemory_operand"  "RS18")))
>   (clobber (reg:HI 26))]
>  "AVR_HAVE_MUL"
>  "%~call __mulsi3"
>  "&& !reload_completed"
>  [(clobber (const_int 0))]
>  {
>    ...
>    FAIL;
>  }
>  [(set_attr "type" "xcall")
>   (set_attr "cc" "clobber")])
>
>
> Again, I used the simple test case from above:
>
> long mul (long a, long b)
> {
>    return a*b;
> }
>
> long mul2 (long a, long b)
> {
>    return b*a;
> }
>
> Compiled with -Os -mmcu=atmega8 -fno-split-wide-types:
>
> mul:
> /* prologue: function */
>        rcall __mulsi3   ;  7   *mulsi3 [length = 1]
> /* epilogue start */
>        ret      ;  21  return  [length = 1]
>
> mul2:
>        push r8  ;  22  *pushqi/1       [length = 1]
>        push r9  ;  23  *pushqi/1       [length = 1]
>        push r10         ;  24  *pushqi/1       [length = 1]
>        push r11         ;  25  *pushqi/1       [length = 1]
>        push r12         ;  26  *pushqi/1       [length = 1]
>        push r13         ;  27  *pushqi/1       [length = 1]
>        push r14         ;  28  *pushqi/1       [length = 1]
>        push r15         ;  29  *pushqi/1       [length = 1]
>        push r28         ;  30  *pushqi/1       [length = 1]
>        push r29         ;  31  *pushqi/1       [length = 1]
>        rcall .  ;  35  *addhi3_sp_R_pc2        [length = 2]
>        rcall .
>        in r28,__SP_L__  ;  36  *movhi_sp/2     [length = 2]
>        in r29,__SP_H__
> /* prologue: function */
> /* frame size = 4 */
> /* stack size = 14 */
> .L__stack_usage = 14
>        movw r12,r22     ;  2   *movsi/1        [length = 2]
>        movw r14,r24
>        movw r24,r20     ;  19  *movsi/1        [length = 2]
>        movw r22,r18
>        movw r20,r14     ;  21  *movsi/1        [length = 2]
>        movw r18,r12
>        rcall __mulsi3   ;  7   *mulsi3 [length = 1]
> /* epilogue start */
>        pop __tmp_reg__  ;  41  *addhi3_sp_R_pc2        [length = 4]
>        pop __tmp_reg__
>        pop __tmp_reg__
>        pop __tmp_reg__
>        pop r29  ;  42  popqi   [length = 1]
>        pop r28  ;  43  popqi   [length = 1]
>        pop r15  ;  44  popqi   [length = 1]
>        pop r14  ;  45  popqi   [length = 1]
>        pop r13  ;  46  popqi   [length = 1]
>        pop r12  ;  47  popqi   [length = 1]
>        pop r11  ;  48  popqi   [length = 1]
>        pop r10  ;  49  popqi   [length = 1]
>        pop r9   ;  50  popqi   [length = 1]
>        pop r8   ;  51  popqi   [length = 1]
>        ret      ;  52  return_from_epilogue    [length = 1]
>
>
> With -fsplit-wide-types (which is on per default) the code is even
> worse and the first function inflates to unacceptable code, too.
>
> Using constraints "=RS22,%0,RS18" instead of "=RS22,%RS22,RS18"
> the code of the second function is a bit better:
>
> mul2:
>        push r28         ;  20  *pushqi/1       [length = 1]
>        push r29         ;  21  *pushqi/1       [length = 1]
>        rcall .  ;  25  *addhi3_sp_R_pc2        [length = 2]
>        rcall .
>        in r28,__SP_L__  ;  26  *movhi_sp/2     [length = 2]
>        in r29,__SP_H__
> /* prologue: function */
> /* frame size = 4 */
> /* stack size = 6 */
> .L__stack_usage = 6
>        std Y+1,r22      ;  2   *movsi/4        [length = 4]
>        std Y+2,r23
>        std Y+3,r24
>        std Y+4,r25
>        movw r24,r20     ;  3   *movsi/1        [length = 2]
>        movw r22,r18
>        ldd r18,Y+1      ;  19  *movsi/3        [length = 4]
>        ldd r19,Y+2
>        ldd r20,Y+3
>        ldd r21,Y+4
>        rcall __mulsi3   ;  7   *mulsi3 [length = 1]
> /* epilogue start */
>        pop __tmp_reg__  ;  31  *addhi3_sp_R_pc2        [length = 4]
>        pop __tmp_reg__
>        pop __tmp_reg__
>        pop __tmp_reg__
>        pop r29  ;  32  popqi   [length = 1]
>        pop r28  ;  33  popqi   [length = 1]
>        ret      ;  34  return_from_epilogue    [length = 1]
>
>
> Remember that without register constraints and explicit hard-reg
> move expansion the result was (both with -fsplit-wide-types and
> -fno-split-wide-types):
>
> mul:
>        push r12         ;  35  *pushqi/1       [length = 1]
>        push r13         ;  36  *pushqi/1       [length = 1]
>        push r14         ;  37  *pushqi/1       [length = 1]
>        push r15         ;  38  *pushqi/1       [length = 1]
> /* prologue: function */
> /* frame size = 0 */
> /* stack size = 4 */
> .L__stack_usage = 4
>        movw r12,r22     ;  2   *movsi/1        [length = 2]
>        movw r14,r24
>        movw r24,r20     ;  3   *movsi/1        [length = 2]
>        movw r22,r18
>        movw r20,r14     ;  26  *movsi/1        [length = 2]
>        movw r18,r12
>        rcall __mulsi3   ;  28  *mulsi3_call    [length = 1]
> /* epilogue start */
>        pop r15  ;  41  popqi   [length = 1]
>        pop r14  ;  42  popqi   [length = 1]
>        pop r13  ;  43  popqi   [length = 1]
>        pop r12  ;  44  popqi   [length = 1]
>        ret      ;  45  return_from_epilogue    [length = 1]
>
> So it appears that IRA is not as smart as we thought and not
> prepared for this...


I have tried this method before I have realising the current.
I drop the "right constraints" idea because of "unable to fiand a
register to spill".
Seems that something changed (register allocator changed) but force
move again seems as a best solution.

Denis.

Re: [Patch,AVR]: PR49687 (better widening 32-bit mul)

Reply via email to