Package: libzita-resampler1 Version: 1.3.0-2 Severity: wishlist Tags: upstream patch
Hi, please find attached a patch that SSE-optimizes resampling of stereo signals; it makes resampling more or less three times as fast on Intel systems, without sacrificing any quality. I talked to upstream about this patch back in the day, and he seemed happy to accept it, but somehow just stopped answering -- I guess he got busy with other things in life. It would be nice if we could get it into Debian nevertheless (I want it for reducing CPU usage in my realtime video mixer). The patch should be attributed to Steinar H. Gunderson <se...@google.com> (i.e., written under my work hat at my ex-job), and is licensed under the same terms as zita-resampler itself (i.e., GPLv3+). -- System Information: Debian Release: stretch/sid APT prefers unstable APT policy: (500, 'unstable'), (500, 'testing'), (1, 'experimental') Architecture: amd64 (x86_64) Foreign Architectures: i386, armhf Kernel: Linux 4.7.0-rc7 (SMP w/4 CPU cores) Locale: LANG=nb_NO.utf8, LC_CTYPE=nb_NO.utf8 (charmap=UTF-8) Shell: /bin/sh linked to /bin/dash Init: systemd (via /run/systemd/system) Versions of packages libzita-resampler1 depends on: ii libc6 2.23-2 ii libgcc1 1:6.1.1-9 ii libstdc++6 6.1.1-9 ii multiarch-support 2.23-2 libzita-resampler1 recommends no packages. libzita-resampler1 suggests no packages. -- no debconf information
diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/resampler.cc --- orig/zita-resampler-1.3.0/libs/resampler.cc 2012-10-26 22:58:55.000000000 +0200 +++ zita-resampler-1.3.0/libs/resampler.cc 2015-11-15 12:27:42.764591015 +0100 @@ -24,6 +24,10 @@ #include <math.h> #include <zita-resampler/resampler.h> +#ifdef __SSE2__ +#include <xmmintrin.h> +#endif + static unsigned int gcd (unsigned int a, unsigned int b) { @@ -47,6 +51,45 @@ return 1; } +#ifdef __SSE2__ + +static inline void calc_stereo_sample_sse (unsigned int hl, + float *c1, + float *c2, + float *q1, + float *q2, + float *out_data) +{ + unsigned int i; + __m128 denorm, s, w1, w2; + + denorm = _mm_set1_ps (1e-20f); + s = denorm; + for (i = 0; i < hl; i += 4) + { + q2 -= 8; + + // s += *q1 * c1 [i]; + w1 = _mm_loadu_ps (&c1 [i]); + s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1), _mm_unpacklo_ps (w1, w1))); + s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 4), _mm_unpackhi_ps (w1, w1))); + + // s += *q2 * c2 [i]; + w2 = _mm_loadu_ps (&c2 [i]); + s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 4), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 0, 1, 1)))); + s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (2, 2, 3, 3)))); + + q1 += 8; + } + s = _mm_sub_ps (s, denorm); + s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2))); + + // Writes two bytes more than we want, but this is fine since out_count >= 2. 
+ _mm_storeu_ps (out_data, s); +} + +#endif + Resampler::Resampler (void) : _table (0), @@ -213,18 +256,28 @@ { float *c1 = _table->_ctab + hl * ph; float *c2 = _table->_ctab + hl * (np - ph); - for (c = 0; c < _nchan; c++) +#ifdef __SSE2__ + if ((hl % 4) == 0 && _nchan == 2 && out_count >= 2) { - float *q1 = p1 + c; - float *q2 = p2 + c; - float s = 1e-20f; - for (i = 0; i < hl; i++) + calc_stereo_sample_sse (hl, c1, c2, p1, p2, out_data); + out_data += 2; + } + else +#endif + { + for (c = 0; c < _nchan; c++) { - q2 -= _nchan; - s += *q1 * c1 [i] + *q2 * c2 [i]; - q1 += _nchan; + float *q1 = p1 + c; + float *q2 = p2 + c; + float s = 1e-20f; + for (i = 0; i < hl; i++) + { + q2 -= _nchan; + s += *q1 * c1 [i] + *q2 * c2 [i]; + q1 += _nchan; + } + *out_data++ = s - 1e-20f; } - *out_data++ = s - 1e-20f; } } else @@ -260,4 +313,3 @@ return 0; } - diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/vresampler.cc --- orig/zita-resampler-1.3.0/libs/vresampler.cc 2012-10-26 22:58:55.000000000 +0200 +++ zita-resampler-1.3.0/libs/vresampler.cc 2015-11-15 12:27:58.424544882 +0100 @@ -25,6 +25,58 @@ #include <zita-resampler/vresampler.h> +#ifdef __SSE2__ + +#include <xmmintrin.h> + +static inline void calc_stereo_sample_sse (int hl, + float b, + float *p1, + float *p2, + float *q1, + float *q2, + float *out_data) +{ + int i; + __m128 denorm, bs, s, c1, c2, w1, w2; + + denorm = _mm_set1_ps (1e-25f); + bs = _mm_set1_ps (b); + s = denorm; + for (i = 0; i < hl; i += 4) + { + p2 -= 8; + + // _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]); + w1 = _mm_loadu_ps (&q1 [i]); + w2 = _mm_loadu_ps (&q1 [i + hl]); + c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1))); + + // _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]); + w1 = _mm_loadu_ps (&q2 [i]); + w2 = _mm_loadu_ps (&q2 [i - hl]); + c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1))); + + // s += *p1 * _c1 [i]; + s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1), _mm_unpacklo_ps (c1, c1))); + s = 
_mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 4), _mm_unpackhi_ps (c1, c1))); + + // s += *p2 * _c2 [i]; + s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 4), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 0, 1, 1)))); + s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (2, 2, 3, 3)))); + + p1 += 8; + } + s = _mm_sub_ps (s, denorm); + s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2))); + + // Writes two bytes more than we want, but this is fine since out_count >= 2. + _mm_storeu_ps (out_data, s); +} + +#endif + + VResampler::VResampler (void) : _table (0), _nchan (0), @@ -212,23 +264,33 @@ a = 1.0f - b; q1 = _table->_ctab + hl * k; q2 = _table->_ctab + hl * (np - k); - for (i = 0; i < hl; i++) +#ifdef __SSE2__ + if ((hl % 4) == 0 && _nchan == 2 && out_count >= 2) { - _c1 [i] = a * q1 [i] + b * q1 [i + hl]; - _c2 [i] = a * q2 [i] + b * q2 [i - hl]; + calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, out_data); + out_data += 2; } - for (c = 0; c < _nchan; c++) + else +#endif { - q1 = p1 + c; - q2 = p2 + c; - a = 1e-25f; - for (i = 0; i < hl; i++) - { - q2 -= _nchan; - a += *q1 * _c1 [i] + *q2 * _c2 [i]; - q1 += _nchan; - } - *out_data++ = a - 1e-25f; + for (i = 0; i < hl; i++) + { + _c1 [i] = a * q1 [i] + b * q1 [i + hl]; + _c2 [i] = a * q2 [i] + b * q2 [i - hl]; + } + for (c = 0; c < _nchan; c++) + { + q1 = p1 + c; + q2 = p2 + c; + a = 1e-25f; + for (i = 0; i < hl; i++) + { + q2 -= _nchan; + a += *q1 * _c1 [i] + *q2 * _c2 [i]; + q1 += _nchan; + } + *out_data++ = a - 1e-25f; + } } } else