Bug#832095: patch for SSE-optimizing resampling of stereo signals

Steinar H. Gunderson Fri, 22 Jul 2016 02:39:26 -0700

Package: libzita-resampler1
Version: 1.3.0-2
Severity: wishlist
Tags: upstream patch


Hi,

Please find attached a patch for SSE-optimizing resampling of stereo signals;
it makes this more or less three times as fast on Intel systems, without
sacrificing any quality.

I talked to upstream about this patch back in the day, and he seemed happy to
accept it, but somehow just stopped answering -- I guess he got busy with other
things in life. It would be nice if we could get it into Debian nevertheless
(I want it for reducing CPU used in my realtime video mixer).

It is taken to be by Steinar H. Gunderson <se...@google.com> (ie., work hat
from my ex-job), and licensed under the same terms as zita-resampler itself
(ie., GPLv3+).

-- System Information:
Debian Release: stretch/sid
  APT prefers unstable
  APT policy: (500, 'unstable'), (500, 'testing'), (1, 'experimental')
Architecture: amd64 (x86_64)
Foreign Architectures: i386, armhf

Kernel: Linux 4.7.0-rc7 (SMP w/4 CPU cores)
Locale: LANG=nb_NO.utf8, LC_CTYPE=nb_NO.utf8 (charmap=UTF-8)
Shell: /bin/sh linked to /bin/dash
Init: systemd (via /run/systemd/system)

Versions of packages libzita-resampler1 depends on:
ii  libc6              2.23-2
ii  libgcc1            1:6.1.1-9
ii  libstdc++6         6.1.1-9
ii  multiarch-support  2.23-2

libzita-resampler1 recommends no packages.

libzita-resampler1 suggests no packages.

-- no debconf information

diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/resampler.cc
--- orig/zita-resampler-1.3.0/libs/resampler.cc	2012-10-26 22:58:55.000000000 +0200
+++ zita-resampler-1.3.0/libs/resampler.cc	2015-11-15 12:27:42.764591015 +0100
@@ -24,6 +24,10 @@
 #include <math.h>
 #include <zita-resampler/resampler.h>
 
+#ifdef __SSE2__
+#include <xmmintrin.h>
+#endif
+
 
 static unsigned int gcd (unsigned int a, unsigned int b)
 {
@@ -47,6 +51,45 @@
     return 1; 
 }
 
+#ifdef __SSE2__
+
+static inline void calc_stereo_sample_sse (unsigned int hl,
+                                           float *c1,
+                                           float *c2,
+                                           float *q1,
+                                           float *q2,
+                                           float *out_data)
+{
+    unsigned int   i;
+    __m128         denorm, s, w1, w2;
+
+    denorm = _mm_set1_ps (1e-20f);
+    s = denorm;
+    for (i = 0; i < hl; i += 4)
+    {
+	q2 -= 8;
+
+	// s += *q1 * c1 [i];
+	w1 = _mm_loadu_ps (&c1 [i]);
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1),     _mm_unpacklo_ps (w1, w1)));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 4), _mm_unpackhi_ps (w1, w1)));
+
+	// s += *q2 * c2 [i];
+	w2 = _mm_loadu_ps (&c2 [i]);
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 4), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 0, 1, 1))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2),     _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (2, 2, 3, 3))));
+
+	q1 += 8;
+    }
+    s = _mm_sub_ps (s, denorm);
+    s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2)));
+
+    // Writes two bytes more than we want, but this is fine since out_count >= 2.
+    _mm_storeu_ps (out_data, s);
+}
+
+#endif
+
 
 Resampler::Resampler (void) :
     _table (0),
@@ -213,18 +256,28 @@
 		{
 		    float *c1 = _table->_ctab + hl * ph;
 		    float *c2 = _table->_ctab + hl * (np - ph);
-		    for (c = 0; c < _nchan; c++)
+#ifdef __SSE2__
+		    if ((hl % 4) == 0 && _nchan == 2 && out_count >= 2)
 		    {
-			float *q1 = p1 + c;
-			float *q2 = p2 + c;
-			float s = 1e-20f;
-			for (i = 0; i < hl; i++)
+			calc_stereo_sample_sse (hl, c1, c2, p1, p2, out_data);
+			out_data += 2;
+		    }
+		    else
+#endif
+                    {
+			for (c = 0; c < _nchan; c++)
 			{
-			    q2 -= _nchan;
-			    s += *q1 * c1 [i] + *q2 * c2 [i];
-			    q1 += _nchan;
+			    float *q1 = p1 + c;
+			    float *q2 = p2 + c;
+			    float s = 1e-20f;
+			    for (i = 0; i < hl; i++)
+			    {
+				q2 -= _nchan;
+				s += *q1 * c1 [i] + *q2 * c2 [i];
+				q1 += _nchan;
+			    }
+			    *out_data++ = s - 1e-20f;
 			}
-			*out_data++ = s - 1e-20f;
 		    }
 		}
 		else
@@ -260,4 +313,3 @@
     return 0;
 }
 
-
diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/vresampler.cc
--- orig/zita-resampler-1.3.0/libs/vresampler.cc	2012-10-26 22:58:55.000000000 +0200
+++ zita-resampler-1.3.0/libs/vresampler.cc	2015-11-15 12:27:58.424544882 +0100
@@ -25,6 +25,58 @@
 #include <zita-resampler/vresampler.h>
 
 
+#ifdef __SSE2__
+
+#include <xmmintrin.h>
+
+static inline void calc_stereo_sample_sse (int hl,
+                                           float b,
+                                           float *p1,
+                                           float *p2,
+                                           float *q1,
+                                           float *q2,
+                                           float *out_data)
+{
+    int            i;
+    __m128         denorm, bs, s, c1, c2, w1, w2;
+
+    denorm = _mm_set1_ps (1e-25f);
+    bs = _mm_set1_ps (b);
+    s = denorm;
+    for (i = 0; i < hl; i += 4)
+    {
+	p2 -= 8;
+
+	// _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]);
+	w1 = _mm_loadu_ps (&q1 [i]);
+	w2 = _mm_loadu_ps (&q1 [i + hl]);
+	c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
+
+	// _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]);
+	w1 = _mm_loadu_ps (&q2 [i]);
+	w2 = _mm_loadu_ps (&q2 [i - hl]);
+	c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
+
+	// s += *p1 * _c1 [i];
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1),     _mm_unpacklo_ps (c1, c1)));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 4), _mm_unpackhi_ps (c1, c1)));
+
+	// s += *p2 * _c2 [i];
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 4), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 0, 1, 1))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2),     _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (2, 2, 3, 3))));
+
+	p1 += 8;
+    }
+    s = _mm_sub_ps (s, denorm);
+    s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2)));
+
+    // Writes two bytes more than we want, but this is fine since out_count >= 2.
+    _mm_storeu_ps (out_data, s);
+}
+
+#endif
+
+
 VResampler::VResampler (void) :
     _table (0),
     _nchan (0),
@@ -212,23 +264,33 @@
 		    a = 1.0f - b;
 		    q1 = _table->_ctab + hl * k;
 		    q2 = _table->_ctab + hl * (np - k);
-     		    for (i = 0; i < hl; i++)
+#ifdef __SSE2__
+		    if ((hl % 4) == 0 && _nchan == 2 && out_count >= 2)
 		    {
-                        _c1 [i] = a * q1 [i] + b * q1 [i + hl];
-    		        _c2 [i] = a * q2 [i] + b * q2 [i - hl];
+			calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, out_data);
+			out_data += 2;
 		    }
-		    for (c = 0; c < _nchan; c++)
+		    else
+#endif
 		    {
-			q1 = p1 + c;
-			q2 = p2 + c;
-			a = 1e-25f;
-			for (i = 0; i < hl; i++)
-			{
-			    q2 -= _nchan;
-			    a += *q1 * _c1 [i] + *q2 * _c2 [i];
-			    q1 += _nchan;
-			}
-			*out_data++ = a - 1e-25f;
+		        for (i = 0; i < hl; i++)
+		        {
+		            _c1 [i] = a * q1 [i] + b * q1 [i + hl];
+		            _c2 [i] = a * q2 [i] + b * q2 [i - hl];
+		        }
+		        for (c = 0; c < _nchan; c++)
+		        {
+		            q1 = p1 + c;
+		            q2 = p2 + c;
+		            a = 1e-25f;
+		            for (i = 0; i < hl; i++)
+		            {
+		                q2 -= _nchan;
+		                a += *q1 * _c1 [i] + *q2 * _c2 [i];
+		                q1 += _nchan;
+		            }
+		            *out_data++ = a - 1e-25f;
+		        }
 		    }
 		}
 		else

Bug#832095: patch for SSE-optimizing resampling of stereo signals

Reply via email to