Hi,
thanks for adding the EMAC routines!
I'm appending a patch (RFD) which tries to accomplish four things:
a) align labels to the size of the cache line
b) clear the accumulator before use
c) use each fetched sample to calculate both the current
and the next output sample (for .order8 only, and assuming an
even number of input samples)
d) reduce the rounding error
Comments:
a) probably very little effect
b) no effect if this is the only routine using the MAC
c) the mac rx,ry instruction takes only one cycle instead of two.
(If the input data is not fetched from zero-wait-state RAM,
there will be an additional benefit.)
d) probably very little effect, except at very low volume
Since I have neither the toolchain installed nor any
hardware, I did not test the appended patch (I most likely
introduced some errors).
Be warned: I do not know whether this assembles, performs better, or works at all!
Greetings,
Friede
__________________________________________________________________________
Erweitern Sie FreeMail zu einem noch leistungsstärkeren E-Mail-Postfach!
Mehr Infos unter http://freemail.web.de/home/landingpad/?mc=021131
--- coldfire.S.orig 2005-10-27 15:41:05.204083960 +0200
+++ coldfire.S 2005-10-27 16:44:59.888123376 +0200
@@ -25,7 +25,7 @@
*/
.text
.global lpc_decode_emac
- .align 2
+ .align 16 | cache line is 16 byte
lpc_decode_emac:
lea.l (-40, %sp), %sp
movem.l %d2-%d7/%a2-%a5, (%sp)
@@ -41,6 +41,8 @@
move.l %d2, %d3
neg.l %d3
lea.l (%a0, %d3.l*4), %a0 | history
+ movclr.l %acc0, %d3 | acc not assumed zero
+| movclr.l %acc1, %d3
clr.l %d3
move.l %d3, %macsr | we'll need integer mode for this
tst.l %d0
@@ -61,25 +63,41 @@
| last jump table entry coincides with target, so leave it out
.order8:
+| asr #1, %d0 | assuming even number of iterations
movem.l (%a1), %d3-%d7/%a2-%a4 | load lpc coefs
move.l (%a0)+, %a5 | load first history sample
.loop8:
- mac.l %a5, %a4, (%a0)+, %a5, %acc0
- mac.l %a5, %a3, (%a0)+, %a5, %acc0
- mac.l %a5, %a2, (%a0)+, %a5, %acc0
- mac.l %a5, %d7, (%a0)+, %a5, %acc0
- mac.l %a5, %d6, (%a0)+, %a5, %acc0
- mac.l %a5, %d5, (%a0)+, %a5, %acc0
- mac.l %a5, %d4, (%a0)+, %a5, %acc0
- mac.l %a5, %d3, (-7*4, %a0), %a5, %acc0 | load for the next iteration
+ mac.l %a5, %a4, (%a0)+, %a5, %acc1
+ mac.l %a5, %a4 | assuming the (%a0)+ fetch used wait states
+ | it might make sense to reuse %a5 for the next calculation
+ mac.l %a5, %a3, (%a0)+, %a5, %acc1
+ mac.l %a5, %a3 | reuse %a5
+ mac.l %a5, %a2, (%a0)+, %a5, %acc1
+ mac.l %a5, %a2 | reuse %a5
+ mac.l %a5, %d7, (%a0)+, %a5, %acc1
+ mac.l %a5, %d7 | reuse %a5
+ mac.l %a5, %d6, (%a0)+, %a5, %acc1
+ mac.l %a5, %d6 | reuse %a5
+ mac.l %a5, %d5, (%a0)+, %a5, %acc1
+ mac.l %a5, %d5 | reuse %a5
+ mac.l %a5, %d4, (%a0)+, %a5, %acc1
+ mac.l %a5, %d4, (%a0)+, %a5, %acc0
+ mac.l %a5, %d3
+ mac.l %a5, %d3, (-6*4, %a0), %a5, %acc1 | load for the next iteration
+ movclr.l %acc1, %d2 | get sum
+ asr.l %d1, %d2 | shift sum by lp_quantization bits
+| add #1<<(%d1-1), %d2 | add (table,%d1),%d2 | to minimize rounding error
+ add.l %d2, (%a0)+ | add residual and save
movclr.l %acc0, %d2 | get sum
asr.l %d1, %d2 | shift sum by lp_quantization bits
+| add #1<<(%d1-1), %d2 here to minimize rounding error?
add.l %d2, (%a0) | add residual and save
- lea.l (-6*4, %a0), %a0 | point history back at second element
+ lea.l (-5*4, %a0), %a0 | point history back at second element
subq.l #1, %d0 | decrement counter
jne .loop8 | are we done?
jra .exit
+ .align 16 | cache line is 16 byte
.order7:
movem.l (%a1), %d3-%d7/%a2-%a3
move.l (%a0)+, %a5
@@ -99,6 +117,7 @@
jne .loop7
jra .exit
+ .align 16 | cache line is 16 byte
.order6:
movem.l (%a1), %d3-%d7/%a2
move.l (%a0)+, %a5
@@ -117,6 +136,7 @@
jne .loop6
jra .exit
+ .align 16 | cache line is 16 byte
.order5:
movem.l (%a1), %d3-%d7
move.l (%a0)+, %a5
@@ -134,6 +154,7 @@
jne .loop5
jra .exit
+ .align 16 | cache line is 16 byte
.order4:
movem.l (%a1), %d3-%d6
move.l (%a0)+, %a5
@@ -150,6 +171,7 @@
jne .loop4
jra .exit
+ .align 16 | cache line is 16 byte
.order3:
movem.l (%a1), %d3-%d5
move.l (%a0)+, %a5
@@ -165,6 +187,7 @@
jne .loop3
jra .exit
+ .align 16 | cache line is 16 byte
.order2:
movem.l (%a1), %d3-%d4
move.l (%a0)+, %a5
@@ -178,6 +201,7 @@
jne .loop2
jra .exit
+ .align 16 | cache line is 16 byte
.order1:
| no point in using mac here
move.l (%a1), %d3
@@ -189,7 +213,8 @@
subq.l #1, %d0
jne .loop1
jra .exit
-
+
+ .align 16 | cache line is 16 byte
.default:
/* we do the filtering in an unrolled by 4 loop as far as we can, and then
do the rest in an ordinary one by one sample loop.