Ok, here's the code, I've got it down to about 800 milliseconds for a
2^20 double[] -> Complex!double[] transform. In comparison, the R
statistics package, which is probably very heavily optimized and uses (I
think) GCC as its compiler can do a similar transform in ~610
milliseconds, so my code isn't blindingly fast, but it's not
unreasonably slow.
On 8/1/2010 5:21 PM, Jason Spencer wrote:
I don't have much of an opinion on adding to Phobos, but I'd definitely be
interested in seeing the code. I'm starting to do some scientific computing
using D, and that would be a great learning reference for me.
Jason
----- Original Message ----
From: David Simcha<[email protected]>
To: Discuss the phobos library for D<[email protected]>
Sent: Sun, August 1, 2010 1:32:38 PM
Subject: Re: [phobos] FFT
Speed-wise, I've just been goofing around for the past hour or so and I've sped
it up 2x. It now does 1 size 2 ^^ 20 double[] -> Complex!double FFT in about
880 milliseconds.
On 8/1/2010 3:36 PM, Walter Bright wrote:
David Simcha wrote:
As I am no longer going to use FFTs in my kernel density lib, improving
this FFT code will be bumped down my hacking to-do list. Does what I have now
sound better than nothing by a large enough margin to warrant inclusion in
std.numeric, or does it sound too primitive to be widely useful? If it sounds
worth including I'll clean up/document the code and send it to the mailing list
for review. If it sounds too primitive, I'll just scrap it.
I don't know enough about FFT's to make any sort of informed comment.
_______________________________________________
phobos mailing list
[email protected]
http://lists.puremagic.com/mailman/listinfo/phobos
_______________________________________________
phobos mailing list
[email protected]
http://lists.puremagic.com/mailman/listinfo/phobos
/**This library implements a basic power-of-two fast Fourier transform (FFT)
* and inverse FFT in relatively simple, readable, portable D code. It's
* not blindingly fast, but it's not unreasonably slow either. A size 2 ^^ 20
* double[] -> Complex!double FFT takes about 790 milliseconds on my Athlon XP
* 5200+. For comparison, the (probably heavily optimized and compiled with a
* compiler that's better at optimizing floating point arithmetic) R statistics
* package does it in about 610 milliseconds on the same machine.
*
* The algorithm used under the hood is the radix-2 Cooley-Tukey algorithm.
*
* References: http://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm
*
* Copyright (C) 2010 David Simcha
*
* License:
* Boost Software License - Version 1.0 - August 17th, 2003
*
* Permission is hereby granted, free of charge, to any person or organization
* obtaining a copy of the software and accompanying documentation covered by
* this license (the "Software") to use, reproduce, display, distribute,
* execute, and transmit the Software, and to prepare derivative works of the
* Software, and to permit third-parties to whom the Software is furnished to
* do so, all subject to the following:
*
* The copyright notices in the Software and this entire statement, including
* the above license grant, this restriction and the following disclaimer,
* must be included in all copies of the Software, in whole or in part, and
* all derivative works of the Software, unless such copies or derivative
* works are solely in the form of machine-executable object code generated by
* a source language processor.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
* SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
* FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
import std.math, std.stdio, std.contracts, std.range,
std.perf, std.algorithm, std.complex, std.traits, std.random,
std.intrinsic;
version(unittest) {
void main() {}
}
void main() { doTiming(); }
void doTiming() {
auto arr = new double[2 ^^ 20];
foreach(ref elem; arr) {
elem = uniform(-1.0, 1.0);
}
auto pc = new PerformanceCounter;
pc.start;
auto foo = fft(arr);
pc.stop;
writeln(pc.milliseconds, " ms for forward transform.");
pc.start;
auto bar = inverseFft(foo);
pc.stop;
writeln(pc.milliseconds, " ms for inverse transform.");
enforce(approxEqual(map!"a.re"(bar), arr));
writeln("Inverse is approx. equal original.");
}
/**Compute the Fourier transform of range using the O(N log N) Cooley-Tukey
* Algorithm. range must be a random-access range with a size
* that is a power of two. The contents of range can be either numeric types,
* which will be interpreted as pure real values, or complex types with
* properties or members .re and .im that can be read.
*
* Returns: An array of complex numbers representing the transformed data in
* the frequency domain.
*/
Complex!F[] fft(F = double, R)(R range) {
enforce(isPowerOfTwo(range.length),
"Can only do FFTs on ranges with a size that is a power of two.");
Complex!F[] ret;
if(range.length == 0) {
return ret;
}
ret.length = range.length;
fft(range, ret);
return ret;
}
/**Same as the overload, but allows for the results to be stored in a user-
* provided buffer. The buffer must be of the same length as range, must be
* a random-access range, must have slicing, and must contain elements that are
* complex-like. This means that they must have a .re and a .im member or
* property that can be both read and written and is a floating point number.
*/
void fft(Ret, R)(R range, Ret buf)
if(isRandomAccessRange!Ret && isComplexLike!(ElementType!Ret) &&
hasSlicing!Ret) {
enforce(buf.length == range.length);
enforce(isPowerOfTwo(range.length),
"Can only do FFTs on ranges with a size that is a power of two.");
static if(is(R : Stride!R)) {
return fftImpl(range, buf);
} else {
return fftImpl(Stride!R(range, 1), buf);
}
}
/**Computes the inverse Fourier transform of a range. The range must be a
* random access range, have a power of two size, and contain
* elements that are either of type std.complex.Complex or have essentially
* the same compile-time interface.
*
* Returns: The time-domain signal.
*/
Complex!F[] inverseFft(F = double, R)(R range)
if(isRandomAccessRange!R && isComplexLike!(ElementType!R)) {
enforce(isPowerOfTwo(range.length),
"Can only do inverse FFTs on ranges with a size that is a power of
two.");
Complex!F[] ret;
if(range.length == 0) {
return ret;
}
ret.length = range.length;
inverseFft(range, ret);
return ret;
}
/**Inverse FFT that allows a user-supplied buffer to be provided. The buffer
* must be a random access range with slicing, and its elements
* must be some complex-like type.
*/
void inverseFft(Ret, R)(R range, Ret buf)
if(isRandomAccessRange!Ret && isComplexLike!(ElementType!Ret) &&
hasSlicing!Ret) {
enforce(isPowerOfTwo(range.length),
"Can only do inverse FFTs on ranges with a size that is a power of
two.");
// BUGS: This should be done w/o copying to an array first. On the next
// version of DMD this will be do-able easily because new and improved Map
// will be in std.algorithm.
auto swapped = array(map!swapRealImag(range));
fft(swapped, buf);
foreach(ref elem; buf) {
elem = swapRealImag(elem) / buf.length;
}
}
unittest {
// Test values from R.
auto arr = [1,2,3,4,5,6,7,8];
auto fft1 = fft(arr);
assert(approxEqual(map!"a.re"(fft1),
[36.0, -4, -4, -4, -4, -4, -4, -4]));
assert(approxEqual(map!"a.im"(fft1),
[0, 9.6568, 4, 1.6568, 0, -1.6568, -4, -9.6568]));
alias Complex!float C;
auto arr2 = [C(1,2), C(3,4), C(5,6), C(7,8), C(9,10),
C(11,12), C(13,14), C(15,16)];
auto fft2 = fft(arr2);
assert(approxEqual(map!"a.re"(fft2),
[64.0, -27.3137, -16, -11.3137, -8, -4.6862, 0, 11.3137]));
assert(approxEqual(map!"a.im"(fft2),
[72, 11.3137, 0, -4.686, -8, -11.3137, -16, -27.3137]));
auto inv1 = inverseFft(fft1);
assert(approxEqual(map!"a.re"(inv1), arr));
assert(reduce!max(map!"a.im"(inv1)) < 1e-10);
auto inv2 = inverseFft(fft2);
assert(approxEqual(map!"a.re"(inv2), map!"a.re"(arr2)));
assert(approxEqual(map!"a.im"(inv2), map!"a.im"(arr2)));
}
// Swaps the real and imaginary parts of a complex number. This is useful
// for inverse FFTs.
C swapRealImag(C)(C input) {
return C(input.im, input.re);
}
private:
// The reason I couldn't use std.algorithm was b/c its stride length isn't
// modifiable on the fly.
struct Stride(R) {
R range;
size_t _nSteps;
size_t _length;
alias ElementType!(R) E;
this(R range, size_t nStepsIn) {
this.range = range;
_nSteps = nStepsIn;
_length = (range.length + _nSteps - 1) / nSteps;
}
size_t length() const @property {
return _length;
}
E opIndex(size_t index) {
return range[index * _nSteps];
}
E front() {
return range[0];
}
void popFront() {
static if(hasSlicing!R) {
if(range.length >= _nSteps) {
range = range[_nSteps..$];
_length--;
} else {
range.length = 0;
_length = 0;
}
} else {
foreach(i; 0.._nSteps) {
if(range.empty) {
break;
}
range.popFront();
}
_length--;
}
}
bool empty() const @property {
return length == 0;
}
size_t nSteps() const @property {
return _nSteps;
}
size_t nSteps(size_t newVal) @property {
_nSteps = newVal;
_length = (range.length + _nSteps - 1) / nSteps;
return newVal;
}
}
///////////////////////////////////////////////////////////////////////////////
// The following code caches sine and cosine terms for small sub-FFTs. For
// larger FFTs it's faster to compute these terms on the fly because
// of cache effects and because they're not used as often, but for smaller ones,
// we use CTFE and store pre-computed values in an immutable array.
// This cache size seems fairly optimial because:
//
// 1. It will fit in most L1 data caches.
// (It leads to a total cache size of 32K)
//
// 2. DMD blows up if I go up to 8192 and uses several hundred megs of RAM.
//
// 3. It seems empirically like bigger caches don't give much, if any, speedup.
enum nCache = 4096;
double[nCache] computeCache() pure nothrow {
// This computes a negative sine cache by shifting sines by PI / 2. It's
// meant to be evaluated at compile time.
double[nCache] ret;
foreach(i; 0..ret.length) {
ret[(i + nCache / 2) % nCache] = sin(i * 2 * PI / nCache);
}
return ret;
}
immutable double[nCache] trigCache = computeCache();
double negSinFromCache(size_t index) pure nothrow {
return trigCache[index];
}
double cosFromCache(size_t index) pure nothrow {
// cos is just -sin shifted by PI * 3 / 4. Trusting the compiler to
// optimize the readable but inefficient but clearly compiler-optimizable
// index calculation.
return trigCache[(index + nCache * 3 / 4) % nCache];
}
///////////////////////////////////////////////////////////////////////////////
void fftImpl(Ret, R)(Stride!R range, Ret buf) {
if(range.length <= 2) {
slowFourier2(range, buf);
return;
}
auto left = range;
auto right = range;
right.popFront;
left.nSteps = left.nSteps * 2;
right.nSteps = right.nSteps * 2;
assert(left.length == right.length);
auto leftBuf = buf[0..$ / 2];
auto rightBuf = buf[$ / 2..$];
fftImpl(left, leftBuf);
fftImpl(right, rightBuf);
immutable halfLen = range.length / 2;
void doLoop(bool useCache)() {
static if(useCache) {
assert(range.length <= nCache);
immutable multiplier = nCache / range.length;
} else {
immutable step = 2.0 * PI / range.length;
}
// Using ptrdiff_t here b/c converting a signed int to a float is
actually
// measurably faster than converting an unsigned int to a float in this
// context.
foreach(ptrdiff_t k; 0..halfLen) {
static if(useCache) {
immutable cosTwiddle = cosFromCache(k * multiplier);
immutable sinTwiddle = negSinFromCache(k * multiplier);
} else {
immutable loc = step * k;
immutable cosTwiddle = cos(loc);
immutable sinTwiddle = -sin(loc);
}
immutable upperIndex = k + halfLen;
immutable realLower = buf[k].re;
immutable imagLower = buf[k].im;
immutable realUpper = buf[upperIndex].re;
immutable imagUpper = buf[upperIndex].im;
immutable realAdd = cosTwiddle * realUpper - sinTwiddle * imagUpper;
immutable imagAdd = sinTwiddle * realUpper + cosTwiddle * imagUpper;
buf[k].re += realAdd;
buf[k].im += imagAdd;
buf[upperIndex].re = realLower - realAdd;
buf[upperIndex].im = imagLower - imagAdd;
}
}
if(range.length > nCache) {
doLoop!false();
} else {
doLoop!true();
}
}
// Hard-coded base case for FFT of size 2. This is actually a TON faster than
// using a generic slow DFT.
void slowFourier2(Ret, R)(R range, Complex!Ret[] buf) {
buf[0] = range[0] + range[1];
buf[1] = range[0] - range[1];
}
bool isPowerOfTwo(size_t num) {
// BUGS: std.intrinsic takes a uint, not a size_t. Therefore, this
// won't work on 64-bit unless std.intrinsic is fixed.
return bsr(num) == bsf(num);
}
template isComplexLike(T) {
enum bool isComplexLike = is(typeof(T.init.re)) &&
is(typeof(T.init.im));
}
unittest {
static assert(isComplexLike!(Complex!double));
static assert(!isComplexLike!(uint));
}
_______________________________________________
phobos mailing list
[email protected]
http://lists.puremagic.com/mailman/listinfo/phobos