Scaling the output takes 32 or 64 multiplications, whereas scaling the
input instead takes only 8 or 4 multiplications, respectively. As a side
note, the same trick can be applied to other implementations.
Win64: 728 to 562 cycles.
---
libavcodec/dcadsp.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
index 8d242c5..a015051 100644
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -39,19 +39,23 @@ dca_lfe_fir(float *out, const float *in, const float *coefs,
float *out2 = out + decifactor;
const float *cf0 = coefs;
const float *cf1 = coefs + 256;
+ float scaled_in[8];
int j, k;
+ for (j = 0; j < 256 / decifactor; j++)
+ scaled_in[j] = in[-j] * scale;
+
/* One decimated sample generates 2*decifactor interpolated ones */
for (k = 0; k < decifactor; k++) {
float v0 = 0.0;
float v1 = 0.0;
for (j = 0; j < 256 / decifactor; j++) {
- float s = in[-j];
+ float s = scaled_in[j];
v0 += s * *cf0++;
v1 += s * *--cf1;
}
- *out++ = v0 * scale;
- *out2++ = v1 * scale;
+ *out++ = v0;
+ *out2++ = v1;
}
}
--
1.8.0.msysgit.0
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel