http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50256
Bug #: 50256 Summary: AVR GCC - several unnecessary register moves Classification: Unclassified Product: gcc Version: 4.3.3 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c AssignedTo: unassig...@gcc.gnu.org ReportedBy: nickpar...@eaton.com Hi, AVR GCC seems to generate inefficient code. The function below multiplies two unsigned values of at most 24 bits each, then effectively shifts the product right by 24 bits. uint32_t MulU3U3S3(uint32_t a_u3, uint32_t b_u3) { uint32_t answer; asm volatile ( "push r0" "\n\t" "push r1" "\n\t" "clr r20" "\n\t" // zero register // 0 byte shifts "mul %A1,%A2" "\n\t" // a1a2 "mov r2,r0" "\n\t" "mov r3,r1" "\n\t" // 1 byte shifts "mul %A1,%B2" "\n\t" "add r3,r0" "\n\t" "adc r4,r1" "\n\t" "adc r5,r20" "\n\t" "mul %A2,%B1" "\n\t" "add r3,r0" "\n\t" "adc r4,r1" "\n\t" "adc r5,r20" "\n\t" // 2 byte shifts "mul %A1,%C2" "\n\t" "add r4,r0" "\n\t" "adc r5,r1" "\n\t" "adc r6,r20" "\n\t" "mul %A2,%C1" "\n\t" "add r4,r0" "\n\t" "adc r5,r1" "\n\t" "adc r6,r20" "\n\t" "mul %B2,%B1" "\n\t" "add r4,r0" "\n\t" "adc r5,r1" "\n\t" "adc r6,r20" "\n\t" // 3 byte shifts "mul %B1,%C2" "\n\t" "add r5,r0" "\n\t" "adc r6,r1" "\n\t" "adc r7,r20" "\n\t" "mul %B2,%C1" "\n\t" "add r5,r0" "\n\t" "adc r6,r1" "\n\t" "adc r7,r20" "\n\t" // 4 byte shifts "mul %C2,%C1" "\n\t" "add r6,r0" "\n\t" "adc r7,r1" "\n\t" "mov %A0,r5" "\n\t" "mov %B0,r6" "\n\t" "mov %C0,r7" "\n\t" "clr %D0" "\n\t" "pop r1" "\n\t" "pop r0" "\n\t" : "=&r" (answer) : "r" (a_u3), "r" (b_u3) : "r0","r1","r2","r3","r4","r5","r6","r7","r20" ); return (answer); } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Calling code (note the moves after the function call... why can't the function leave the answer in place?) 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 878 040c 6CE5 ldi r22,lo8(167772) 879 040e 7FE8 ldi r23,hi8(167772) 880 0410 82E0 ldi r24,hlo8(167772) 881 0412 90E0 ldi r25,hhi8(167772) 882 0414 20EA ldi r18,lo8(100000) 883 0416 36E8 ldi r19,hi8(100000) 884 0418 41E0 ldi r20,hlo8(100000) 885 041a 50E0 ldi r21,hhi8(100000) 886 041c 0E94 0000 call MulU3U3S3 887 0420 7B01 movw r14,r22 888 0422 8C01 movw r16,r24 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; The called code is below. Note that: (1) one argument is unnecessarily moved to a new location; (2) at the end, the result is unnecessarily moved to a new location. The following code is also unnecessary: 283 010e 8901 movw r16,r18 284 0110 9A01 movw r18,r20 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 263 .global MulU3U3S3 265 MulU3U3S3: 266 .LFB8: 267 .LM19: 268 .LVL22: 269 00f6 2F92 push r2 270 00f8 3F92 push r3 271 00fa 4F92 push r4 272 00fc 5F92 push r5 273 00fe 6F92 push r6 274 0100 7F92 push r7 275 0102 CF92 push r12 276 0104 DF92 push r13 277 0106 EF92 push r14 278 0108 FF92 push r15 279 010a 0F93 push r16 280 010c 1F93 push r17 281 /* prologue: function */ 282 /* frame size = 0 */ 283 010e 8901 movw r16,r18 284 0110 9A01 movw r18,r20 285 .LM20: 286 0112 6801 movw r12,r16 287 0114 7901 movw r14,r18 288 /* #APP */ 289 ; 326 "maths_mul.c" 1 290 0116 0F92 push r0 291 0118 1F92 push r1 292 011a 4427 clr r20 293 011c 6C9D mul r22,r12 294 011e 202C mov r2,r0 295 0120 312C mov r3,r1 296 0122 6D9D mul r22,r13 297 0124 300C add r3,r0 298 0126 411C adc r4,r1 299 0128 541E adc r5,r20 300 012a C79E mul r12,r23 301 012c 300C add r3,r0 302 012e 411C adc r4,r1 303 0130 541E adc r5,r20 304 0132 6E9D mul r22,r14 305 0134 400C add r4,r0 306 0136 511C adc r5,r1 307 0138 641E adc r6,r20 308 013a C89E mul r12,r24 309 013c 400C add r4,r0 310 013e 511C adc r5,r1 311 0140 641E adc r6,r20 312 0142 D79E mul r13,r23 313 0144 400C add r4,r0 314 0146 511C adc r5,r1 
315 0148 641E adc r6,r20 316 014a 7E9D mul r23,r14 317 014c 500C add r5,r0 318 014e 611C adc r6,r1 319 0150 741E adc r7,r20 320 0152 D89E mul r13,r24 321 0154 500C add r5,r0 322 0156 611C adc r6,r1 323 0158 741E adc r7,r20 324 015a E89E mul r14,r24 325 015c 600C add r6,r0 326 015e 711C adc r7,r1 327 0160 052D mov r16,r5 328 0162 162D mov r17,r6 329 0164 272D mov r18,r7 330 0166 3327 clr r19 331 0168 1F90 pop r1 332 016a 0F90 pop r0 333 334 ; 0 "" 2 335 .LVL23: 336 .LM21: 337 /* #NOAPP */ 338 016c B801 movw r22,r16 339 .LVL24: 340 016e C901 movw r24,r18 341 .LVL25: 342 /* epilogue start */ 343 0170 1F91 pop r17 344 0172 0F91 pop r16 345 0174 FF90 pop r15 346 0176 EF90 pop r14 347 0178 DF90 pop r13 348 017a CF90 pop r12 349 017c 7F90 pop r7 350 017e 6F90 pop r6 351 0180 5F90 pop r5 352 0182 4F90 pop r4 353 0184 3F90 pop r3 354 0186 2F90 pop r2 355 0188 0895 ret