[Bug middle-end/65082] Wasted cycles when using a register based varible

2015-02-22 Thread NickParker at Eaton dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65082

--- Comment #4 from NickParker at Eaton dot com ---
That was with 's' optimisation, and it does the sames for optimisation level
'1'.


[Bug middle-end/65082] Wasted cycles when using a register based varible

2015-02-22 Thread NickParker at Eaton dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65082

--- Comment #3 from NickParker at Eaton dot com ---

register uint16_t r4 asm (r4);
register uint16_t r6 asm (r6);
volatile int8_t localOscCosine;
volatile int8_t acInput;

void pllExec(void)
{
  int16_t mix_output_s2=0;
  r4 += r6;
  localOscCosine = pgm_read_byte(cosine7b[r4  8]);
  mix_output_s2 = (localOscCosine * acInput); // GCC-AVR it give 16-bits
};


---
results in.


void pllExec(void)
  39:pll.c  {
  15   .loc 1 39 0
  16   .cfi_startproc
  17   /* prologue: function */
  18   /* frame size = 0 */
  19   /* stack size = 0 */
  20   .L__stack_usage = 0
  21   .LVL0:
  40:pll.c    //int16_t ss;
  41:pll.c    int16_t mix_output_s2=0;
  42:pll.c    r4 += r6;
  22   .loc 1 42 0
  23  F301  movw r30,r6
  24 0002 E40D  add r30,r4
  25 0004 F51D  adc r31,r5
  26 0006 2F01  movw r4,r30
  27   .LVL1:
  28   .LBB2:


[Bug c/65082] New: Wasted cycles when using a register based varible

2015-02-16 Thread NickParker at Eaton dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65082

Bug ID: 65082
   Summary: Wasted cycles when using a register based varible
   Product: gcc
   Version: unknown
Status: UNCONFIRMED
  Severity: enhancement
  Priority: P3
 Component: c
  Assignee: unassigned at gcc dot gnu.org
  Reporter: NickParker at Eaton dot com

gcc version 4.8.0 20130306 (experimental) (GCC) 

Was just playing around and found this.  When using a register based variable,
the compiler misses an obvious optimisation.  

Notice in code below the addition does not take place 'in place' and is instead
performed in scratch/temporary registers and then shifted back to phaseAccPh.
 Why not just add directly to phaseAccPh since in this case it IS register
based already.  It seems that GCC thinks that the variable is still in SRAM
or something else.
Nick.




c code:
-
register uint16_t phaseAccPh  asm (r4);
uint16_t phaseAccFr;

phaseAccPh += phaseAccFr;



asm code:
-
  40:pll.c  void pllExec(void)
  41:pll.c  {
  15   .loc 1 41 0
  16   .cfi_startproc
  17   /* prologue: function */
  18   /* frame size = 0 */
  19   /* stack size = 0 */
  20   .L__stack_usage = 0
  42:pll.c    int16_t mix_output_s2;
  43:pll.c    phaseAccPh += phaseAccFr;
  21   .loc 1 43 0
  22  E091  lds r30,phaseAccFr
  23 0004 F091  lds r31,phaseAccFr+1
  24 0008 E40D  add r30,r4
  25 000a F51D  adc r31,r5
  26 000c 2F01  movw r4,r30


[Bug c/57931] New: There are superfluous movw instructions in when using GCC-AVR (WinAVR) asm code.

2013-07-18 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57931

Bug ID: 57931
   Summary: There are superfluous movw instructions in when using
GCC-AVR (WinAVR) asm code.
   Product: gcc
   Version: 4.6.2
Status: UNCONFIRMED
  Severity: enhancement
  Priority: P3
 Component: c
  Assignee: unassigned at gcc dot gnu.org
  Reporter: NickParker at Eaton dot com

Created attachment 30526
  -- http://gcc.gnu.org/bugzilla/attachment.cgi?id=30526action=edit
C code and a listing

There are superfluous movw instructions in when using GCC-AVR (WinAVR) asm
code.

Might as well make some tighter code here!


[Bug c/50314] New: GCC changes order of code so it does not work as intended

2011-09-07 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50314

 Bug #: 50314
   Summary: GCC changes order of code so it does not work as
intended
Classification: Unclassified
   Product: gcc
   Version: 4.3.3
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: c
AssignedTo: unassig...@gcc.gnu.org
ReportedBy: nickpar...@eaton.com


While timing a piece of code I discovered the following.  GCC reordered my code
such that the assembler is not doing things as the C code suggests.
Regards, Nick

C CODE:

  cli();
  time = BGndTimerReadNowIsr();
  result_u2  =  MulU2U2( ZERO_DEGC_IN_DEGK, manPres_u2) / (airTemp_u2 +
ZERO_DEGC_IN_DEGK);
  time = BGndTimerReadNowIsr() - time;
  sei();


Note in ASM code, the timer read is called PRIOR to the division. In the C code
above this is not how the intent is. This may not be a bug, but I certainly
didnt expect GCC to break my C statements and order them as it wishes!

ASM CODE:
113 0092 F894  cli
 114;  0  2
 115   .LM16:
 116   /* #NOAPP */
 117 0094 0E94  call BGndTimerReadNowIsr
 118 0098 9A83  std Y+2,r25
 119 009a 8983  std Y+1,r24
 120   .LM17:
 121 009c 82E1  ldi r24,lo8(4370)
 122 009e 91E1  ldi r25,hi8(4370)
 123 00a0 69E9  ldi r22,lo8(4505)
 124 00a2 71E1  ldi r23,hi8(4505)
 125 00a4 0E94  call MulU2U2
 126 00a8 7B01  movw r14,r22
 127 00aa 8C01  movw r16,r24
 128   .LM18:
 129 00ac 0E94  call BGndTimerReadNowIsr
 130 00b0 2981  ldd r18,Y+1
 131 00b2 3A81  ldd r19,Y+2
 132 00b4 821B  sub r24,r18
 133 00b6 930B  sbc r25,r19
 134   .LVL4:
 135 00b8 9A83  std Y+2,r25
 136 00ba 8983  std Y+1,r24
 137   .LM19:
 138   /* #APP */
 139;  232 tests.c 1
 140 00bc 7894  sei
 141;  0  2
 142   .LM20:
 143   /* #NOAPP */
 144 00be 80E0  ldi r24,lo8(0)
 145   .LVL5:
 146 00c0 60E0  ldi r22,lo8(__c.3250)
 147 00c2 70E0  ldi r23,hi8(__c.3250)
 148 00c4 0E94  call PutFlashString
 149   .LM21:
 150 00c8 80E0  ldi r24,lo8(0)
 151 00ca 60E0  ldi r22,lo8(__c.3252)
 152 00cc 70E0  ldi r23,hi8(__c.3252)
 153 00ce 0E94  call PutFlashString
 154   .LM22:
 155 00d2 C801  movw r24,r16
 156 00d4 B701  movw r22,r14
 157 00d6 22E3  ldi r18,lo8(5170)
 158 00d8 34E1  ldi r19,hi8(5170)
 159 00da 40E0  ldi r20,hlo8(5170)
 160 00dc 50E0  ldi r21,hhi8(5170)
 161 00de 0E94  call __udivmodsi4


[Bug c/50314] GCC changes order of code so it does not work as intended

2011-09-07 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50314

--- Comment #3 from NickParker at Eaton dot com 2011-09-07 21:13:22 UTC ---
#define T1_GET_TIMER_NON_ATOMIC()   (TCNT1)

uint16_t BGndTimerReadNowIsr(void)
{
  uint16_t period_u2;
  period_u2 = T1_GET_TIMER_NON_ATOMIC();
  return(period_u2);
}


[Bug c/50314] GCC changes order of code so it does not work as intended

2011-09-07 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50314

--- Comment #5 from NickParker at Eaton dot com 2011-09-07 21:16:02 UTC ---
So if its not a bug, what is the solution to this problem?

I tried wrapping up the code I wanted to time in a separate C function, but the
compiler still optimised my problem back.

Regards, Nick


[Bug c/50314] GCC changes order of code so it does not work as intended

2011-09-07 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50314

--- Comment #6 from NickParker at Eaton dot com 2011-09-07 21:18:59 UTC ---
 How is TCNT1 defined?  Again just attach the preprocessed source.

Sorry, not sure how to get preprocessed source - I need to read the GCC manual.

From AVR header files for the ATmega128 :-

(iom128.h)

#define TCNT1 _SFR_IO16(0x2C)


[Bug target/50281] result registers are overwritten giving incorrect result

2011-09-04 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50281

--- Comment #10 from NickParker at Eaton dot com 2011-09-04 21:22:30 UTC ---

Richard, 'bogus' isnt a technical term I'm familiar with - I'm not entirely
sure
what you mean, however, I have found the problem with my ASM code.

If you'll notice I am adding partial products to r4,r5,r6,r7 without them ever
being initialised.  What was missing was initialistion of the temporary result
registers r4,r5,r6,r7 where I am generating my result.   

After adding these initialisatons, and also a few movw's for a few less
cycles in a couple of places, the code now works correctly.

Thanks for your input and sorry that I made a mess of reporting this bug, which
was nothing to do with the compiler.

Thanks, Nick.


[Bug target/50256] AVR GCC - several unnecessary register moves

2011-09-04 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50256

--- Comment #4 from NickParker at Eaton dot com 2011-09-04 21:27:43 UTC ---
Hi,

Thanks for your input and sorry to make a mess of reporting this.
The arithmetic is fine and the code gives the results I expect.
However, I have since discovered and corrected a 'bug'.  
I was not initialising my 'temporary result registers i.e. r2,r3,r4,r5,r6,r7.

Regarding the arithmetic, the result is INTENTIONALLY accumulated from the
lowest partial products to highest partial products so that at most only 3
additions are ever needed per partial product.


Thanks, Nick


[Bug target/50256] AVR GCC - several unnecessary register moves

2011-09-04 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50256

--- Comment #5 from NickParker at Eaton dot com 2011-09-04 21:29:40 UTC ---
Thanks, also was able to add a few movws to save cycles.
Nick.


[Bug c/50281] New: result registers are overwritten giving incorrect result

2011-09-02 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50281

 Bug #: 50281
   Summary: result registers are overwritten giving incorrect
result
Classification: Unclassified
   Product: gcc
   Version: 4.3.3
Status: UNCONFIRMED
  Severity: major
  Priority: P3
 Component: c
AssignedTo: unassig...@gcc.gnu.org
ReportedBy: nickpar...@eaton.com


Application code

bool8_t testMaths(void)
{
  uint32_t result1_u4;
  uint32_t result2_u4;
  int32_t result1_s4;
  int32_t result2_s4;
  //;
  // Multiplying U3s
  //;
  result1_u4 = MulU3U3S3(16777215L,100L);  // should be around 100

  PutImmediateString(ECU_COMMS,\r\nmulu3u3s3 : [ );
  PrintINT4(ECU_COMMS, 16777215L, 'D',0);
  PutImmediateString(ECU_COMMS, ] x [);
  PrintINT4(ECU_COMMS, 100L, 'D',0);
  PutImmediateString(ECU_COMMS, ] = [);
  PrintINT4(ECU_COMMS, result1_u4, 'D',0);
  PutImmediateString(ECU_COMMS, ]);
.
.
.
.
}


/***
* MulU3U3S3()
* 
* Function: Multiplies two unsigned 24bit max values, 
*   then shifts left by 2^24
***/
uint32_t MulU3U3S3(uint32_t a_u4, uint32_t b_u4)
{
uint32_t answer;

asm volatile
(

push r0   \n\t
push r1   \n\t

clr r20   \n\t  // zero register

// 0 byte shifts
mul %A1,%A2   \n\t  // a1a2
mov r2,r0 \n\t
mov r3,r1 \n\t

// 1 byte shifts
mul %A1,%B2  \n\t
add r3,r0\n\t
adc r4,r1\n\t
adc r5,r20   \n\t

mul %A2,%B1  \n\t
add r3,r0\n\t
adc r4,r1\n\t
adc r5,r20   \n\t

// 2 byte shifts
mul %A1,%C2   \n\t
add r4,r0\n\t
adc r5,r1\n\t
adc r6,r20   \n\t

mul %A2,%C1   \n\t
add r4,r0\n\t
adc r5,r1\n\t
adc r6,r20   \n\t

mul %B2,%B1   \n\t
add r4,r0\n\t
adc r5,r1\n\t
adc r6,r20   \n\t

// 3 byte shifts
mul %B1,%C2   \n\t
add r5,r0\n\t
adc r6,r1\n\t
adc r7,r20   \n\t

mul %B2,%C1   \n\t
add r5,r0\n\t
adc r6,r1\n\t
adc r7,r20   \n\t

// 4 byte shifts
mul %C2,%C1   \n\t
add r6,r0\n\t
adc r7,r1\n\t

mov %A0,r5   \n\t
mov %B0,r6   \n\t
mov %C0,r7   \n\t
clr %D0  \n\t

//adc %G0,r20   \n\t
pop r1\n\t
pop r0\n\t

: =r (answer)
: r (a_u4), r (b_u4)
: r2,r3,r4,r5,r6,r7,r20
);

return (answer);
}


[Bug c/50281] result registers are overwritten giving incorrect result

2011-09-02 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50281

NickParker at Eaton dot com changed:

   What|Removed |Added

  Component|inline-asm  |c
   Severity|normal  |major

--- Comment #1 from NickParker at Eaton dot com 2011-09-03 01:09:21 UTC ---
top level function to test mulu3u3s3 function and print result
-
 932   .LM119:
 933 0478 6FEF  ldi r22,lo8(16777215)
 934 047a 7FEF  ldi r23,hi8(16777215)
 935 047c 8FEF  ldi r24,hlo8(16777215)
 936 047e 90E0  ldi r25,hhi8(16777215)
 937 0480 24E6  ldi r18,lo8(100)
 938 0482 30E0  ldi r19,hi8(100)
 939 0484 40E0  ldi r20,hlo8(100)
 940 0486 50E0  ldi r21,hhi8(100)
 941 0488 0E94  call MulU3U3S3
 942 048c 6B01  movw r12,r22
 943 048e 7C01  movw r14,r24
 944   .LVL128:
 945   .LM120:
 946 0490 80E0  ldi r24,lo8(0)
 947 0492 60E0  ldi r22,lo8(__c.2370)
 948 0494 70E0  ldi r23,hi8(__c.2370)
 949 0496 0E94  call PutFlashString
 950   .LM121:
 951 049a 80E0  ldi r24,lo8(0)
 952 049c 4FEF  ldi r20,lo8(16777215)
 953 049e 5FEF  ldi r21,hi8(16777215)
 954 04a0 6FEF  ldi r22,hlo8(16777215)
 955 04a2 70E0  ldi r23,hhi8(16777215)
 956 04a4 24E4  ldi r18,lo8(68)
 957 04a6 0E94  call PrintINT4
 958   .LM122:
 959 04aa 80E0  ldi r24,lo8(0)
 960 04ac 60E0  ldi r22,lo8(__c.2372)
 961 04ae 70E0  ldi r23,hi8(__c.2372)
 962 04b0 0E94  call PutFlashString
 963   .LM123:
 964 04b4 80E0  ldi r24,lo8(0)
 965 04b6 44E6  ldi r20,lo8(100)
 966 04b8 50E0  ldi r21,hi8(100)
 967 04ba 60E0  ldi r22,hlo8(100)
 968 04bc 70E0  ldi r23,hhi8(100)
 969 04be 24E4  ldi r18,lo8(68)
 970 04c0 0E94  call PrintINT4
 971   .LM124:
 972 04c4 80E0  ldi r24,lo8(0)
 973 04c6 60E0  ldi r22,lo8(__c.2374)
 974 04c8 70E0  ldi r23,hi8(__c.2374)
 975 04ca 0E94  call PutFlashString
 976   .LM125:
 977 04ce 80E0  ldi r24,lo8(0)
 978 04d0 B701  movw r22,r14
 979 04d2 A601  movw r20,r12
 980 04d4 24E4  ldi r18,lo8(68)
 981 04d6 0E94  call PrintINT4
 982   .LM126:
 983 04da 80E0  ldi r24,lo8(0)
 984 04dc 60E0  ldi r22,lo8(__c.2376)
 985 04de 70E0  ldi r23,hi8(__c.2376)
 986 04e0 0E94  call PutFlashString

-maths code 

259   .globalMulU3U3S3
 261   MulU3U3S3:
 262   .LFB8:
 263   .LM19:
 264   .LVL22:
 265 00ee 2F92  push r2
 266 00f0 3F92  push r3
 267 00f2 4F92  push r4
 268 00f4 5F92  push r5
 269 00f6 6F92  push r6
 270 00f8 7F92  push r7
 271 00fa AF92  push r10
 272 00fc EF92  push r14
 273 00fe FF92  push r15
 274 0100 0F93  push r16
 275 0102 1F93  push r17
 276   /* prologue: function */
 277   /* frame size = 0 */
 278   .LM20:
 279 0104 7901  movw r14,r18
 280 0106 8A01  movw r16,r20
 281   /* #APP */
 282;  324 maths_mul.c 1
 283 0108 0F92  push r0
 284 010a 1F92  push r1
 285 010c AF92  push r10
 286 010e AA24  clr r10
 287 0110 6E9D  mul r22,r14
 288 0112 202C  mov r2,r0
 289 0114 312C  mov r3,r1
 290 0116 6F9D  mul r22,r15
 291 0118 300C  add r3,r0
 292 011a 411C  adc r4,r1
 293 011c 5A1C  adc r5,r10
 294 011e E79E  mul r14,r23
 295 0120 300C  add r3,r0
 296 0122 411C  adc r4,r1
 297 0124 5A1C  adc r5,r10
 298 0126 609F  mul r22,r16
 299 0128 400C  add r4,r0
 300 012a 511C  adc r5,r1
 301 012c 6A1C  adc r6,r10
 302 012e E89E  mul r14,r24
 303 0130 400C  add r4,r0
 304 0132 511C  adc r5,r1
 305 0134 6A1C  adc r6,r10
 306 0136 F79E  mul r15,r23
 307 0138 400C  add r4,r0
 308 013a 511C  adc r5,r1
 309 013c 6A1C  adc r6,r10
 310 013e 709F  mul r23,r16
 311 0140 500C  add r5,r0
 312

[Bug target/50281] result registers are overwritten giving incorrect result

2011-09-02 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50281

--- Comment #3 from NickParker at Eaton dot com 2011-09-03 01:28:57 UTC ---
The final printed calculation result of MulU3U3S3() is wrong, because two of
the four result registers are incorrect and have been overwritten.

mulu3u3s3 : [ +0016777215 ] x [+000100 ] = [+0010502615 ]

I am wondering if the CPU is running out of registers?

Because I have found stepping through that the calcualtion is actually working
correctly


[Bug target/50281] result registers are overwritten giving incorrect result

2011-09-02 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50281

--- Comment #4 from NickParker at Eaton dot com 2011-09-03 01:30:26 UTC ---
Hi Andrew,
Can you please explain what you mean by %1 and %2. Thanks.


[Bug target/50281] result registers are overwritten giving incorrect result

2011-09-02 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50281

--- Comment #5 from NickParker at Eaton dot com 2011-09-03 01:32:20 UTC ---
Sorry. I pasted a broken version. Before. Code below works.


uint32_t MulU3U3S3(uint32_t a_u4, uint32_t b_u4)
{
//uint32_t answer;

asm volatile
(

push r0  \n\t
push r1  \n\t
push r10 \n\t

clr r10  \n\t  // zero register

// 0 byte shifts
mul %A1,%A2  \n\t  // a1a2
mov r2,r0\n\t
mov r3,r1\n\t

// 1 byte shifts
mul %A1,%B2  \n\t
add r3,r0\n\t
adc r4,r1\n\t
adc r5,r10   \n\t

mul %A2,%B1  \n\t
add r3,r0\n\t
adc r4,r1\n\t
adc r5,r10   \n\t

// 2 byte shifts
mul %A1,%C2  \n\t
add r4,r0\n\t
adc r5,r1\n\t
adc r6,r10   \n\t

mul %A2,%C1  \n\t
add r4,r0\n\t
adc r5,r1\n\t
adc r6,r10   \n\t

mul %B2,%B1  \n\t
add r4,r0\n\t
adc r5,r1\n\t
adc r6,r10   \n\t

// 3 byte shifts
mul %B1,%C2  \n\t
add r5,r0\n\t
adc r6,r1\n\t
adc r7,r10   \n\t

mul %B2,%C1  \n\t
add r5,r0\n\t
adc r6,r1\n\t
adc r7,r10   \n\t

// 4 byte shifts
mul %C2,%C1  \n\t
add r6,r0\n\t
adc r7,r1\n\t

mov %A0,r5   \n\t
mov %B0,r6   \n\t
mov %C0,r7   \n\t
clr %D0  \n\t

//adc %G0,r20  \n\t
pop r10  \n\t
pop r1   \n\t
pop r0   \n\t

: =r (answer)
: r (a_u4), r (b_u4)
: r2,r3,r4,r5,r6,r7,r10
);

return (answer);
}


[Bug target/50281] result registers are overwritten giving incorrect result

2011-09-02 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50281

--- Comment #7 from NickParker at Eaton dot com 2011-09-03 04:45:08 UTC ---
Please ignore the r10/r20 guff I was experimenting. I later realised the
muls3s3u3 code gives the right answer, the problem occurs later on somehow
Nick.


[Bug target/50281] result registers are overwritten giving incorrect result

2011-09-02 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50281

--- Comment #8 from NickParker at Eaton dot com 2011-09-03 04:46:45 UTC ---
Please ignore the r10/r20 guff I was experimenting. I later realised the
muls3s3u3 code gives the right answer, the problem occurs later on somehow the
registers where the results are are getting walked on. 
Nick.


[Bug target/50256] AVR GCC - several unnecessary register moves

2011-09-01 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50256

--- Comment #2 from NickParker at Eaton dot com 2011-09-01 21:30:41 UTC ---
I have the latest AVR GCC release from 2010-01-10 (4.3.3) and this is what I am
using.


[Bug c/50256] New: AVR GCC - several unnecessary register moves

2011-08-31 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50256

 Bug #: 50256
   Summary: AVR GCC - several unnecessary register moves
Classification: Unclassified
   Product: gcc
   Version: 4.3.3
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: c
AssignedTo: unassig...@gcc.gnu.org
ReportedBy: nickpar...@eaton.com


Hi,

AVR GCC seems to generate inefficent code.  Function below multiplies two
unsigned 24-bit max values, then effectively shifts right by 24 shifts.

uint32_t MulU3U3S3(uint32_t a_u3, uint32_t b_u3)
{
uint32_t answer;

asm volatile
(

push r0  \n\t
push r1  \n\t

clr r20  \n\t  // zero register

// 0 byte shifts
mul %A1,%A2  \n\t  // a1a2
mov r2,r0\n\t
mov r3,r1\n\t

// 1 byte shifts
mul %A1,%B2  \n\t
add r3,r0\n\t
adc r4,r1\n\t
adc r5,r20   \n\t

mul %A2,%B1  \n\t
add r3,r0\n\t
adc r4,r1\n\t
adc r5,r20   \n\t

// 2 byte shifts
mul %A1,%C2  \n\t
add r4,r0\n\t
adc r5,r1\n\t
adc r6,r20   \n\t

mul %A2,%C1  \n\t
add r4,r0\n\t
adc r5,r1\n\t
adc r6,r20   \n\t

mul %B2,%B1  \n\t
add r4,r0\n\t
adc r5,r1\n\t
adc r6,r20   \n\t

// 3 byte shifts
mul %B1,%C2  \n\t
add r5,r0\n\t
adc r6,r1\n\t
adc r7,r20   \n\t

mul %B2,%C1  \n\t
add r5,r0\n\t
adc r6,r1\n\t
adc r7,r20   \n\t

// 4 byte shifts
mul %C2,%C1  \n\t
add r6,r0\n\t
adc r7,r1\n\t

mov %A0,r5   \n\t
mov %B0,r6   \n\t
mov %C0,r7   \n\t
clr %D0  \n\t

pop r1\n\t
pop r0\n\t

: =r (answer)
: r (a_u3), r (b_u3)
: r0,r1,r2,r3,r4,r5,r6,r7,r20
);

return (answer);
}

;
Calling code 
(note moves after function..why cant function leave answer in place?)
;

 878 040c 6CE5  ldi r22,lo8(167772)
 879 040e 7FE8  ldi r23,hi8(167772)
 880 0410 82E0  ldi r24,hlo8(167772)
 881 0412 90E0  ldi r25,hhi8(167772)
 882 0414 20EA  ldi r18,lo8(10)
 883 0416 36E8  ldi r19,hi8(10)
 884 0418 41E0  ldi r20,hlo8(10)
 885 041a 50E0  ldi r21,hhi8(10)
 886 041c 0E94  call MulU3U3S3
 887 0420 7B01  movw r14,r22
 888 0422 8C01  movw r16,r24

;
Called code is below. Note that
- one argument is unnecessarily moved to a new location
- at end, result is unnecessarily moved to a new location

also this code is unnecessary too

 283 010e 8901  movw r16,r18
 284 0110 9A01  movw r18,r20

;

263.global MulU3U3S3
 265MulU3U3S3:
 266.LFB8:
 267.LM19:
 268.LVL22:
 269 00f6 2F92  push r2
 270 00f8 3F92  push r3
 271 00fa 4F92  push r4
 272 00fc 5F92  push r5
 273 00fe 6F92  push r6
 274 0100 7F92  push r7
 275 0102 CF92  push r12
 276 0104 DF92  push r13
 277 0106 EF92  push r14
 278 0108 FF92  push r15
 279 010a 0F93  push r16
 280 010c 1F93  push r17
 281/* prologue: function */
 282/* frame size = 0 */
 283 010e 8901  movw r16,r18
 284 0110 9A01  movw r18,r20
 285.LM20:
 286 0112 6801  movw r12,r16
 287 0114 7901  movw r14,r18
 288/* #APP */
 289 ;  326 maths_mul.c 1
 290 0116 0F92  push r0
 291 0118 1F92  push r1
 292 011a 4427  clr r20
 293 011c 6C9D  mul r22,r12
 294 011e 202C  mov r2,r0
 295 0120 312C  mov r3,r1
 296 0122 6D9D  mul r22,r13
 297 0124 300C  add r3,r0

[Bug c/50223] New: AVRGCC - dont clear r26 and r27.....its a (small) waste of CPU cycles.

2011-08-28 Thread NickParker at Eaton dot com
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50223

 Bug #: 50223
   Summary: AVRGCC - dont clear r26 and r27.its a (small)
waste of CPU cycles.
Classification: Unclassified
   Product: gcc
   Version: unknown
Status: UNCONFIRMED
  Severity: enhancement
  Priority: P3
 Component: c
AssignedTo: unassig...@gcc.gnu.org
ReportedBy: nickpar...@eaton.com
  Host: PC Windows XP
Target: AVR Mega 128
 Build: avr-gcc (WinAVR 20100110) 4.3.3


Dont clear r26 and r27.its a (small) waste of CPU cycles.
Regards, Nick.


This function normalises a 32bit unsigned integer and returns
the number of shifts.


uint8_t ldgZeroCntNormU32(uint32_t * x)
{
  uint8_t zCount=0;
  uint8_t shft;
  uint32_t quad;
  quad=*x;

  while (!(uint8_t)(quad  24))
  {
zCount += 8;
quad =8;
  }
  shft = pgm_read_byte(leadingZeros[(uint8_t)(quad  24)]);
  *x = quad  shft;
  return (zCount + shft);
}


   .file   divu16u16.c
   2__SREG__ = 0x3f
   3__SP_H__ = 0x3e
   4__SP_L__ = 0x3d
   5__CCP__  = 0x34
   6__tmp_reg__ = 0
   7__zero_reg__ = 1
  15.Ltext0:
  16.global ldgZeroCntNormU32
  18ldgZeroCntNormU32:
  19.LFB12:
  20.LM1:
  21.LVL0:
  22  CF93  push r28
  23 0002 DF93  push r29
  24/* prologue: function */
  25/* frame size = 0 */
  26 0004 EC01  movw r28,r24
  27.LM2:
  28 0006 2881  ld r18,Y
  29 0008 3981  ldd r19,Y+1
  30 000a 4A81  ldd r20,Y+2
  31 000c 5B81  ldd r21,Y+3
  32.LVL1:
  33.LM3:
  34 000e 852F  mov r24,r21
  35 0010 9927  clr r25
  36 0012 AA27  clr r26
  37 0014 BB27  clr r27
  38.LVL2:
  39 0016 E82F  mov r30,r24
  40 0018 8823  tst r24
  41 001a 01F4  brne .L8
  42 001c 60E0  ldi r22,lo8(0)
  43.LVL3:
  44.L4:
  45.LM4:
  46 001e 685F  subi r22,lo8(-(8))
  47.LM5:
  48 0020 542F  mov r21,r20
  49 0022 432F  mov r20,r19
  50 0024 322F  mov r19,r18
  51 0026 2227  clr r18
  52.LM6:
  53 0028 852F  mov r24,r21
  54 002a 9927  clr r25
  55 002c AA27  clr r26
  56 002e BB27  clr r27
  57 0030 E82F  mov r30,r24
  58 0032 8823  tst r24
  59 0034 01F0  breq .L4
  60.L3:
  61.LBB2:
  62.LM7:
  63 0036 F0E0  ldi r31,lo8(0)
  64 0038 E050  subi r30,lo8(-(leadingZeros))
  65 003a F040  sbci r31,hi8(-(leadingZeros))
  66/* #APP */
  67 ;  111 divu16u16.c 1
  68 003c E491  lpm r30, Z
  69
  70 ;  0  2
  71.LVL4:
  72/* #NOAPP */
  73.LBE2:
  74.LM8:
  75 003e 0E2E  mov r0,r30
  76 0040 00C0  rjmp 2f
  77 0042 220F  1:  lsl r18
  78 0044 331F  rol r19
  79 0046 441F  rol r20
  80 0048 551F  rol r21
  81 004a 0A94  2:  dec r0
  82 004c 02F4  brpl 1b
  83 004e 2883  st Y,r18
  84 0050 3983  std Y+1,r19
  85 0052 4A83  std Y+2,r20
  86 0054 5B83  std Y+3,r21
  87.LM9:
  88 0056 8E2F  mov r24,r30
  89 0058 860F  add r24,r22
  90/* epilogue start */
  91 005a DF91  pop r29
  92 005c CF91  pop r28
  93.LVL5:
  94 005e 0895  ret