On Mon, Aug 29, 2016 at 06:58:24PM +0000, Fons Adriaensen wrote:
> I wil not accept the patch in its current form, but OTOH the
> code is too good to just be ignored, so I will integrate it
> in another way.
> 
> For the next release of zita-resampler I will reorganise the
> code a bit, so it will be possible to have separate optimised
> Resampler1,2,4 classes (for 1,2,4 channels respectively) using
> the SSE code, and without too much code duplication. Same for
> Vresampler.
> 
> So Steinar, could you provide optimised 1 and 4 chan versions
> as well ? Even better would be if the latter could handle any
> multiple of 4 channels. In all cases you may assume (hlen % 4 == 0). 

Hi Fons,

I expanded on the patch; this new version supports 1, 2 and multiples of 4
channels, and it no longer relies on the serial code (when I can't write past
the end, I just write to a temporary buffer and copy out from there).
Of course, you'll need to reorganize to fit the new structure once you have
it, but hopefully, that should be simple.

The code is pretty much straight-up; the multiples-of-4 VResampler
version isn't optimal for 4 nor for multiples-of-4, but it should be a
reasonable compromise between the two. I guess that if multiples-of-4
is the more important case, we should go to storing the coefficients
(like the scalar version does) instead of computing them anew for each
group of four. (Except for hlen <= 32, in which case it probably would
be optimal to keep them in registers, but I'm not writing special code
for that :-) )

I didn't write AVX versions yet because they would need function
multiversioning to be useful, and in the current structure, that would
mean quite a lot of duplicated code. (Unfortunately, you can't multiversion
inlined functions yet.)

I wrote some test code to make sure I didn't mess up. My original test for
this was based on resampling noise and comparing it to a reference rendering,
but this required my entire audio pipeline running, so I made a simpler one
that simply resamples a sine and compares to another sine. It is simplistic,
but catches most kinds of SSE-ification mistakes instantly, so I've included
it in case you find it useful. Consider both the patch and the test file
licensed under GPLv3+ -- let me know if you want some other kind of license.

/* Steinar */
-- 
Homepage: https://www.sesse.net/
// zita-resampler tests; meant only to verify correctness of
// e.g. SSE optimizations, not as a means of comparing quality
// of different resamplers, or as exhaustive tests.

#include <zita-resampler/resampler.h>
#include <zita-resampler/vresampler.h>
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <vector>

using namespace std;

constexpr int in_freq = 44100;
constexpr int out_freq = 48000;
constexpr int samples_per_block = 137;  // Prime.

vector<float> make_freqs(unsigned num_channels)
{
	vector<float> ret;
	for (unsigned i = 0; i < num_channels; ++i) {
		ret.push_back(440 - i);
	}
	return ret;
}

void setup_resampler(VResampler *resampler, unsigned in_freq, unsigned out_freq, unsigned num_channels, float rratio)
{
	resampler->setup(double(out_freq) / double(in_freq), num_channels, /*hlen=*/32);
	resampler->set_rratio(rratio);
}

void setup_resampler(Resampler *resampler, unsigned in_freq, unsigned out_freq, unsigned num_channels, float rratio)
{
	resampler->setup(in_freq, out_freq, num_channels, /*hlen=*/32);
	assert(rratio == 1.0f);
}

void print_result(VResampler *resampler, unsigned num_channels, float rratio, float max_err_db, float rms_err_db)
{
	printf("VResampler test: %u channel(s), rratio=%.2f   max error = %+7.2f dB   RMS error = %+7.2f dB\n",
		num_channels, rratio, max_err_db, rms_err_db);
}

void print_result(Resampler *resampler, unsigned num_channels, float rratio, float max_err_db, float rms_err_db)
{
	printf("Resampler test:  %u channel(s)                max error = %+7.2f dB   RMS error = %+7.2f dB\n",
		num_channels, max_err_db, rms_err_db);
}

template<class T>
void test_resampler(int num_channels, float rratio = 1.0f)
{
	vector<float> freqs = make_freqs(num_channels);
	T resampler;
	setup_resampler(&resampler, in_freq, out_freq, num_channels, rratio);

	const int initial_delay = resampler.inpsize() / 2 - 1;
	vector<float> in_phaser_speeds, out_phaser_speeds;
	for (unsigned i = 0; i < num_channels; ++i) {
		in_phaser_speeds.push_back(2.0 * M_PI * freqs[i] / in_freq);
		out_phaser_speeds.push_back(2.0 * M_PI * freqs[i] / (out_freq * rratio));
	}

	float max_err = 0.0f, sum_sq_err = 0.0f;

	unsigned num_output_samples = 0;
	vector<float> in, out;
	out.resize(samples_per_block * num_channels * 2);  // Plenty.
	for (unsigned block = 0; block < 10; ++block) {
		in.clear();
		for (unsigned i = 0; i < samples_per_block; ++i) {
			int sample_num = block * samples_per_block + i;
			for (unsigned channel = 0; channel < num_channels; ++channel) {
				in.push_back(sin((sample_num - initial_delay) * in_phaser_speeds[channel]));
			}
		}
		resampler.inp_count = in.size() / num_channels;
		resampler.inp_data = &in[0];
		resampler.out_count = out.size() / num_channels;
		resampler.out_data = &out[0];
		resampler.process();
		assert(resampler.inp_count == 0);
		assert(resampler.out_count > 0);
		for (unsigned i = 0; i < out.size() / num_channels - resampler.out_count; ++i) {
			for (unsigned channel = 0; channel < num_channels; ++channel) {
				const float val = out[i * num_channels + channel];
				const float ref = sin(num_output_samples * out_phaser_speeds[channel]);
				const float err = val - ref;
				max_err = max(max_err, fabs(err));
				sum_sq_err += err * err;
			}
			++num_output_samples;
		}
	}

	float rms_err = sqrt(sum_sq_err / num_output_samples);
	print_result(&resampler, num_channels, rratio, 20.0 * log10(max_err), 20.0 * log10(rms_err));
}

int main(void)
{
	for (unsigned num_channels : {1, 2, 3, 4, 8}) {
		test_resampler<Resampler>(num_channels);
		for (float rratio : { 0.99, 1.0, 1.01 }) {
			test_resampler<VResampler>(num_channels, rratio);
		}
		printf("\n");
	}
}
diff -ur orig/zita-resampler-1.3.0/libs/resampler.cc zita-resampler-1.3.0/libs/resampler.cc
--- orig/zita-resampler-1.3.0/libs/resampler.cc	2012-10-26 22:58:55.000000000 +0200
+++ zita-resampler-1.3.0/libs/resampler.cc	2016-09-05 00:30:34.520191288 +0200
@@ -24,6 +24,10 @@
 #include <math.h>
 #include <zita-resampler/resampler.h>
 
+#ifdef __SSE2__
+#include <xmmintrin.h>
+#endif
+
 
 static unsigned int gcd (unsigned int a, unsigned int b)
 {
@@ -47,6 +51,118 @@
     return 1; 
 }
 
+#ifdef __SSE2__
+
+static inline float calc_mono_sample_sse (unsigned int hl,
+                                          const float *c1,
+                                          const float *c2,
+                                          const float *q1,
+                                          const float *q2)
+{
+    unsigned int   i;
+    __m128         denorm, s, w1, w2, shuf;
+
+    denorm = _mm_set1_ps (1e-20f);
+    s = denorm;
+    for (i = 0; i < hl; i += 4)
+    {
+	q2 -= 4;
+
+	// s += *q1 * c1 [i];
+	w1 = _mm_loadu_ps (&c1 [i]);
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1), w1));
+
+	// s += *q2 * c2 [i];
+	w2 = _mm_loadu_ps (&c2 [i]);
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 1, 2, 3))));
+
+	q1 += 4;
+    }
+    s = _mm_sub_ps (s, denorm);
+
+    // Add all the elements of s together into one. Adapted from
+    // http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
+    shuf = _mm_shuffle_ps (s, s, _MM_SHUFFLE (2, 3, 0, 1));
+    s = _mm_add_ps (s, shuf);
+    s = _mm_add_ss (s, _mm_movehl_ps (shuf, s));
+    return _mm_cvtss_f32 (s);
+}
+
+// Note: This writes four floats instead of two (the last two are garbage).
+// The caller will need to make sure there is room for all four.
+static inline void calc_stereo_sample_sse (unsigned int hl,
+                                           const float *c1,
+                                           const float *c2,
+                                           const float *q1,
+                                           const float *q2,
+                                           float *out_data)
+{
+    unsigned int   i;
+    __m128         denorm, s, w1, w2;
+
+    denorm = _mm_set1_ps (1e-20f);
+    s = denorm;
+    for (i = 0; i < hl; i += 4)
+    {
+	q2 -= 8;
+
+	// s += *q1 * c1 [i];
+	w1 = _mm_loadu_ps (&c1 [i]);
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1),     _mm_unpacklo_ps (w1, w1)));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 4), _mm_unpackhi_ps (w1, w1)));
+
+	// s += *q2 * c2 [i];
+	w2 = _mm_loadu_ps (&c2 [i]);
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 4), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 0, 1, 1))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2),     _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (2, 2, 3, 3))));
+
+	q1 += 8;
+    }
+    s = _mm_sub_ps (s, denorm);
+    s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2)));
+
+    _mm_storeu_ps (out_data, s);
+}
+
+static inline void calc_quad_sample_sse (int hl,
+                                         int nchan,
+                                         const float *c1,
+                                         const float *c2,
+                                         const float *q1,
+                                         const float *q2,
+                                         float *out_data)
+{
+    int            i;
+    __m128         denorm, s, w1, w2;
+
+    denorm = _mm_set1_ps (1e-20f);
+    s = denorm;
+    for (i = 0; i < hl; i += 4)
+    {
+	q2 -= 4 * nchan;
+
+	// s += *p1 * _c1 [i];
+	w1 = _mm_loadu_ps (&c1 [i]);
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1),             _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (0, 0, 0, 0))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + nchan),     _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (1, 1, 1, 1))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 2 * nchan), _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (2, 2, 2, 2))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q1 + 3 * nchan), _mm_shuffle_ps (w1, w1, _MM_SHUFFLE (3, 3, 3, 3))));
+
+	// s += *p2 * _c2 [i];
+	w2 = _mm_loadu_ps (&c2 [i]);
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 3 * nchan), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (0, 0, 0, 0))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + 2 * nchan), _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (1, 1, 1, 1))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2 + nchan),     _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (2, 2, 2, 2))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (q2),             _mm_shuffle_ps (w2, w2, _MM_SHUFFLE (3, 3, 3, 3))));
+
+	q1 += 4 * nchan;
+    }
+    s = _mm_sub_ps (s, denorm);
+
+    _mm_storeu_ps (out_data, s);
+}
+#endif
+
 
 Resampler::Resampler (void) :
     _table (0),
@@ -213,18 +329,42 @@
 		{
 		    float *c1 = _table->_ctab + hl * ph;
 		    float *c2 = _table->_ctab + hl * (np - ph);
-		    for (c = 0; c < _nchan; c++)
+#ifdef __SSE2__
+		    if ((hl % 4) == 0 && _nchan == 1)
+                    {
+			*out_data++ = calc_mono_sample_sse (hl, c1, c2, p1, p2);
+                    }
+		    else if ((hl % 4) == 0 && _nchan == 2)
 		    {
-			float *q1 = p1 + c;
-			float *q2 = p2 + c;
-			float s = 1e-20f;
-			for (i = 0; i < hl; i++)
+                        if (out_count >= 2)
+                        {
+			    calc_stereo_sample_sse (hl, c1, c2, p1, p2, out_data);
+                        }
+                        else
+                        {
+                            float tmp[4];
+			    calc_stereo_sample_sse (hl, c1, c2, p1, p2, tmp);
+                            out_data[0] = tmp[0];
+                            out_data[1] = tmp[1];
+                        }
+			out_data += 2;
+		    }
+		    else
+#endif
+                    {
+			for (c = 0; c < _nchan; c++)
 			{
-			    q2 -= _nchan;
-			    s += *q1 * c1 [i] + *q2 * c2 [i];
-			    q1 += _nchan;
+			    float *q1 = p1 + c;
+			    float *q2 = p2 + c;
+			    float s = 1e-20f;
+			    for (i = 0; i < hl; i++)
+			    {
+				q2 -= _nchan;
+				s += *q1 * c1 [i] + *q2 * c2 [i];
+				q1 += _nchan;
+			    }
+			    *out_data++ = s - 1e-20f;
 			}
-			*out_data++ = s - 1e-20f;
 		    }
 		}
 		else
diff -ur orig/zita-resampler-1.3.0/libs/vresampler.cc zita-resampler-1.3.0/libs/vresampler.cc
--- orig/zita-resampler-1.3.0/libs/vresampler.cc	2012-10-26 22:58:55.000000000 +0200
+++ zita-resampler-1.3.0/libs/vresampler.cc	2016-09-05 00:33:53.907511211 +0200
@@ -25,6 +25,152 @@
 #include <zita-resampler/vresampler.h>
 
 
+#ifdef __SSE2__
+
+#include <xmmintrin.h>
+
+static inline float calc_mono_sample_sse (int hl,
+                                          float b,
+                                          const float *p1,
+                                          const float *p2,
+                                          const float *q1,
+                                          const float *q2)
+{
+    int            i;
+    __m128         denorm, bs, s, c1, c2, w1, w2, shuf;
+
+    denorm = _mm_set1_ps (1e-25f);
+    bs = _mm_set1_ps (b);
+    s = denorm;
+    for (i = 0; i < hl; i += 4)
+    {
+	p2 -= 4;
+
+	// _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]);
+	w1 = _mm_loadu_ps (&q1 [i]);
+	w2 = _mm_loadu_ps (&q1 [i + hl]);
+	c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
+
+	// _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]);
+	w1 = _mm_loadu_ps (&q2 [i]);
+	w2 = _mm_loadu_ps (&q2 [i - hl]);
+	c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
+
+	// s += *p1 * _c1 [i];
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1), c1));
+
+	// s += *p2 * _c2 [i];
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 1, 2, 3))));
+
+	p1 += 4;
+    }
+    s = _mm_sub_ps (s, denorm);
+
+    // Add all the elements of s together into one. Adapted from
+    // http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
+    shuf = _mm_shuffle_ps (s, s, _MM_SHUFFLE (2, 3, 0, 1));
+    s = _mm_add_ps (s, shuf);
+    s = _mm_add_ss (s, _mm_movehl_ps (shuf, s));
+    return _mm_cvtss_f32 (s);
+}
+
+// Note: This writes four floats instead of two (the last two are garbage).
+// The caller will need to make sure there is room for all four.
+static inline void calc_stereo_sample_sse (int hl,
+                                           float b,
+                                           const float *p1,
+                                           const float *p2,
+                                           const float *q1,
+                                           const float *q2,
+                                           float *out_data)
+{
+    int            i;
+    __m128         denorm, bs, s, c1, c2, w1, w2;
+
+    denorm = _mm_set1_ps (1e-25f);
+    bs = _mm_set1_ps (b);
+    s = denorm;
+    for (i = 0; i < hl; i += 4)
+    {
+	p2 -= 8;
+
+	// _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]);
+	w1 = _mm_loadu_ps (&q1 [i]);
+	w2 = _mm_loadu_ps (&q1 [i + hl]);
+	c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
+
+	// _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]);
+	w1 = _mm_loadu_ps (&q2 [i]);
+	w2 = _mm_loadu_ps (&q2 [i - hl]);
+	c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
+
+	// s += *p1 * _c1 [i];
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1),     _mm_unpacklo_ps (c1, c1)));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 4), _mm_unpackhi_ps (c1, c1)));
+
+	// s += *p2 * _c2 [i];
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 4), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 0, 1, 1))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2),     _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (2, 2, 3, 3))));
+
+	p1 += 8;
+    }
+    s = _mm_sub_ps (s, denorm);
+    s = _mm_add_ps (s, _mm_shuffle_ps (s, s, _MM_SHUFFLE (1, 0, 3, 2)));
+
+    _mm_storeu_ps (out_data, s);
+}
+
+static inline void calc_quad_sample_sse (int hl,
+                                         int nchan,
+                                         float b,
+                                         const float *p1,
+                                         const float *p2,
+                                         const float *q1,
+                                         const float *q2,
+                                         float *out_data)
+{
+    int            i;
+    __m128         denorm, bs, s, c1, c2, w1, w2;
+
+    denorm = _mm_set1_ps (1e-25f);
+    bs = _mm_set1_ps (b);
+    s = denorm;
+    for (i = 0; i < hl; i += 4)
+    {
+	p2 -= 4 * nchan;
+
+	// _c1 [i] = q1 [i] + b * (q1 [i + hl] - q1 [i]);
+	w1 = _mm_loadu_ps (&q1 [i]);
+	w2 = _mm_loadu_ps (&q1 [i + hl]);
+	c1 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
+
+	// _c2 [i] = q2 [i] + b * (q2 [i - hl] - q2 [i]);
+	w1 = _mm_loadu_ps (&q2 [i]);
+	w2 = _mm_loadu_ps (&q2 [i - hl]);
+	c2 = _mm_add_ps (w1, _mm_mul_ps(bs, _mm_sub_ps (w2, w1)));
+
+	// s += *p1 * _c1 [i];
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1),             _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (0, 0, 0, 0))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + nchan),     _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (1, 1, 1, 1))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 2 * nchan), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (2, 2, 2, 2))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p1 + 3 * nchan), _mm_shuffle_ps (c1, c1, _MM_SHUFFLE (3, 3, 3, 3))));
+
+	// s += *p2 * _c2 [i];
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 3 * nchan), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (0, 0, 0, 0))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + 2 * nchan), _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (1, 1, 1, 1))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2 + nchan),     _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (2, 2, 2, 2))));
+	s = _mm_add_ps (s, _mm_mul_ps (_mm_loadu_ps (p2),             _mm_shuffle_ps (c2, c2, _MM_SHUFFLE (3, 3, 3, 3))));
+
+	p1 += 4 * nchan;
+    }
+    s = _mm_sub_ps (s, denorm);
+
+    _mm_storeu_ps (out_data, s);
+}
+
+#endif
+
+
 VResampler::VResampler (void) :
     _table (0),
     _nchan (0),
@@ -163,7 +309,7 @@
 
 int VResampler::process (void)
 {
-    unsigned int   k, np, in, nr, n, c;
+    unsigned int   j, k, np, in, nr, n, c;
     int            i, hl, nz;
     double         ph, dp, dd; 
     float          a, b, *p1, *p2, *q1, *q2;
@@ -212,23 +358,55 @@
 		    a = 1.0f - b;
 		    q1 = _table->_ctab + hl * k;
 		    q2 = _table->_ctab + hl * (np - k);
-     		    for (i = 0; i < hl; i++)
+#ifdef __SSE2__
+		    if ((hl % 4) == 0 && _nchan == 1)
+		    {
+			*out_data++ = calc_mono_sample_sse (hl, b, p1, p2, q1, q2);
+		    }
+		    else if ((hl % 4) == 0 && _nchan == 2)
 		    {
-                        _c1 [i] = a * q1 [i] + b * q1 [i + hl];
-    		        _c2 [i] = a * q2 [i] + b * q2 [i - hl];
+			if (out_count >= 2)
+			{
+			    calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, out_data);
+			}
+			else
+			{
+			    float tmp[4];
+			    calc_stereo_sample_sse (hl, b, p1, p2, q1, q2, tmp);
+			    out_data[0] = tmp[0];
+			    out_data[1] = tmp[1];
+			}
+			out_data += 2;
+		    }
+		    else if ((hl % 4) == 0 && (_nchan % 4) == 0)
+		    {
+			for (j = 0; j < _nchan; j += 4)
+			{
+			    calc_quad_sample_sse (hl, _nchan, b, p1 + j, p2 + j, q1, q2, out_data + j);
+			}
+			out_data += _nchan;
 		    }
-		    for (c = 0; c < _nchan; c++)
+		    else
+#endif
 		    {
-			q1 = p1 + c;
-			q2 = p2 + c;
-			a = 1e-25f;
 			for (i = 0; i < hl; i++)
 			{
-			    q2 -= _nchan;
-			    a += *q1 * _c1 [i] + *q2 * _c2 [i];
-			    q1 += _nchan;
+			    _c1 [i] = a * q1 [i] + b * q1 [i + hl];
+			    _c2 [i] = a * q2 [i] + b * q2 [i - hl];
+			}
+			for (c = 0; c < _nchan; c++)
+			{
+			    q1 = p1 + c;
+			    q2 = p2 + c;
+			    a = 1e-25f;
+			    for (i = 0; i < hl; i++)
+			    {
+				q2 -= _nchan;
+				a += *q1 * _c1 [i] + *q2 * _c2 [i];
+				q1 += _nchan;
+			    }
+			    *out_data++ = a - 1e-25f;
 			}
-			*out_data++ = a - 1e-25f;
 		    }
 		}
 		else

Reply via email to