Hi,

Thanks for adding the EMAC routines!
I'm appending a patch (RFD) which tries to accomplish four things:

a) align labels to the size of the cache line
b) clear the accumulator before use
c) use each fetched value for both the current
   and the next calculation (for .order8 only, and assuming
   an even number of input samples)
d) reduce rounding error

Comment:
a) probably very little effect
b) no effect if this is the only routine using the MAC
c) the mac rx,ry instruction takes only one cycle instead of two.
   (If the input data is not fetched from zero-wait-state RAM,
   there will be an additional benefit.)
d) probably very little effect except at very low volume

Since I have neither the toolchain installed nor any
hardware, I did not test the appended patch (I most likely
introduced some errors).

Be warned: I do not know whether this assembles, performs better, or works at all!

Greetings,
Friede
__________________________________________________________________________
Erweitern Sie FreeMail zu einem noch leistungsstärkeren E-Mail-Postfach!        
        
Mehr Infos unter http://freemail.web.de/home/landingpad/?mc=021131

--- coldfire.S.orig	2005-10-27 15:41:05.204083960 +0200
+++ coldfire.S	2005-10-27 16:44:59.888123376 +0200
@@ -25,7 +25,7 @@
  */
     .text
     .global lpc_decode_emac
-    .align 2
+    .align 16                 | cache line is 16 byte
 lpc_decode_emac:
     lea.l (-40, %sp), %sp
     movem.l %d2-%d7/%a2-%a5, (%sp)
@@ -41,6 +41,8 @@
     move.l %d2, %d3 
     neg.l %d3 
     lea.l (%a0, %d3.l*4), %a0 | history
+    movclr.l %acc0, %d3       | acc not assumed zero
+|    movclr.l %acc1, %d3
     clr.l %d3
     move.l %d3, %macsr        | we'll need integer mode for this
     tst.l %d0          
@@ -61,25 +63,41 @@
 
 | last jump table entry coincides with target, so leave it out
 .order8:
+|    asr #1, %d0  | assuming even number of iterations	
     movem.l (%a1), %d3-%d7/%a2-%a4 | load lpc coefs
     move.l (%a0)+, %a5             | load first history sample
 .loop8:
-    mac.l %a5, %a4, (%a0)+, %a5, %acc0
-    mac.l %a5, %a3, (%a0)+, %a5, %acc0
-    mac.l %a5, %a2, (%a0)+, %a5, %acc0
-    mac.l %a5, %d7, (%a0)+, %a5, %acc0
-    mac.l %a5, %d6, (%a0)+, %a5, %acc0
-    mac.l %a5, %d5, (%a0)+, %a5, %acc0
-    mac.l %a5, %d4, (%a0)+, %a5, %acc0
-    mac.l %a5, %d3, (-7*4, %a0), %a5, %acc0 | load for the next iteration
+    mac.l %a5, %a4, (%a0)+, %a5, %acc1
+    mac.l %a5, %a4                 | assuming the (%a0)+ fetch used wait states
+                                   | it might make sense to reuse %a5 for the next calculation
+    mac.l %a5, %a3, (%a0)+, %a5, %acc1
+    mac.l %a5, %a3                 | reuse %a5
+    mac.l %a5, %a2, (%a0)+, %a5, %acc1
+    mac.l %a5, %a2                 | reuse %a5
+    mac.l %a5, %d7, (%a0)+, %a5, %acc1
+    mac.l %a5, %d7                 | reuse %a5
+    mac.l %a5, %d6, (%a0)+, %a5, %acc1
+    mac.l %a5, %d6                 | reuse %a5
+    mac.l %a5, %d5, (%a0)+, %a5, %acc1
+    mac.l %a5, %d5                 | reuse %a5
+    mac.l %a5, %d4, (%a0)+, %a5, %acc1
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0
+    mac.l %a5, %d3
+    mac.l %a5, %d3, (-6*4, %a0), %a5, %acc1 | load for the next iteration
+    movclr.l %acc1, %d2    | get sum
+    asr.l %d1, %d2         | shift sum by lp_quantization bits
+| add #1<<(%d1-1), %d2 | add (table,%d1),%d2 | to minimize rounding error
+    add.l %d2, (%a0)+      | add residual and save
     movclr.l %acc0, %d2    | get sum
     asr.l %d1, %d2         | shift sum by lp_quantization bits
+| add #1<<(%d1-1), %d2 here to minimize rounding error?
     add.l %d2, (%a0)       | add residual and save
-    lea.l (-6*4, %a0), %a0 | point history back at second element
+    lea.l (-5*4, %a0), %a0 | point history back at second element
     subq.l #1, %d0         | decrement counter
     jne .loop8             | are we done?
     jra .exit
 
+    .align 16              | cache line is 16 byte
 .order7:
     movem.l (%a1), %d3-%d7/%a2-%a3
     move.l (%a0)+, %a5
@@ -99,6 +117,7 @@
     jne .loop7
     jra .exit
 
+    .align 16                 | cache line is 16 byte
 .order6:
     movem.l (%a1), %d3-%d7/%a2
     move.l (%a0)+, %a5
@@ -117,6 +136,7 @@
     jne .loop6
     jra .exit
 
+    .align 16                 | cache line is 16 byte
 .order5:
     movem.l (%a1), %d3-%d7
     move.l (%a0)+, %a5
@@ -134,6 +154,7 @@
     jne .loop5
     jra .exit
 
+    .align 16                 | cache line is 16 byte
 .order4:
     movem.l (%a1), %d3-%d6
     move.l (%a0)+, %a5
@@ -150,6 +171,7 @@
     jne .loop4
     jra .exit
 
+    .align 16                 | cache line is 16 byte
 .order3:
     movem.l (%a1), %d3-%d5
     move.l (%a0)+, %a5
@@ -165,6 +187,7 @@
     jne .loop3
     jra .exit
 
+    .align 16                 | cache line is 16 byte
 .order2:
     movem.l (%a1), %d3-%d4
     move.l (%a0)+, %a5
@@ -178,6 +201,7 @@
     jne .loop2
     jra .exit
 
+    .align 16                 | cache line is 16 byte
 .order1:
     | no point in using mac here
     move.l (%a1), %d3
@@ -189,7 +213,8 @@
     subq.l #1, %d0
     jne .loop1
     jra .exit
-    
+
+    .align 16                 | cache line is 16 byte
 .default:
     /* we do the filtering in an unrolled by 4 loop as far as we can, and then
        do the rest in an ordinary one by one sample loop.

Reply via email to