On 7/19/07, Richard Guenther <[EMAIL PROTECTED]> wrote:
Of course, if any then the array indexing variant is fixed.  It would be nice
to see a complete testcase with a pessimization, maybe you can file
a bugreport about this?
There's many issues for all alternatives and i'm not qualified to
pinpoint them further.
I've taken http://ompf.org/ray/sphereflake/ which is used as a
benchmark already here
http://www.suse.de/~gcctest/c++bench/raytracer/, because it's small,
self contained and has such a basic 3 component class that's used all
over.
It doesn't use any kind of array access operator, but it's good enough
to show the price one has to pay before even thinking of providing
some. It has been adjusted to use floats and access members through
accessors (to allow for a straighter comparison of all cases).

variation 0 is the reference, a mere struct { float x,y,z; ...};,
performs as good as the original, but wouldn't allow for any 'valid'
indexing.
variation 1 is struct { float f[3]; ... }
variations 2,3,4,5 try to use some union

# /usr/local/gcc-4.3-20070720/bin/g++ -v
Using built-in specs.
Target: x86_64-unknown-linux-gnu
Configured with: ../configure --prefix=/usr/local/gcc-4.3-20070720
--enable-languages=c,c++ --enable-threads=posix --disable-checking
--disable-nls --disable-shared --disable-win32-registry
--with-system-zlib --disable-multilib --verbose --with-gcc=gcc-4.2
--with-gnu-ld --with-gnu-as --enable-checking=none --disable-bootstrap
Thread model: posix
gcc version 4.3.0 20070720 (experimental)
# make bench
[snip]
sf.v0

real    0m3.963s
user    0m3.812s
sys     0m0.152s
sf.v1

real    0m3.972s
user    0m3.864s
sys     0m0.104s
sf.v2

real    0m10.384s
user    0m10.261s
sys     0m0.120s
sf.v3

real    0m10.390s
user    0m10.289s
sys     0m0.104s
sf.v4

real    0m10.388s
user    0m10.265s
sys     0m0.124s
sf.v5

real    0m10.399s
user    0m10.281s
sys     0m0.116s

There's some inlining  difference between union variations and the
first two, but they clearly stand in their own league anyway.
So we can only seriously consider the first two.
Variation #0 would ask for invalid c++ (pointer arithmetic abuse, not
an option anymore) or forbidding array access operator and going to
set/get + memcpy, but pretty optimal.
Variation #1 (straight array) is quite annoying in C++ (no initializer
list, need to reformulate all access etc...) and already show some
slight pessimization, but it's not easy to track. Apparently g++ got a
bit better lately in this regard, or it's only blatant on larger data
or more complex cases.

I hope this shows how problematic it is for the end user.
// sphere flake bvh raytracer (c) 2005, thierry berger-perrin <[EMAIL PROTECTED]>
// this code is released under the GNU Public License.
// see http://ompf.org/ray/sphereflake/
// compile with ie g++ -O2 -ffast-math sphereflake.cc
// usage: ./sphereflake [lvl=6] >pix.ppm
#include <cmath>
#include <iostream>
#include <cstdlib>
#include <limits>
#define GIMME_SHADOWS

enum { childs = 9, ss= 2, ss_sqr = ss*ss }; /* not really tweakable anymore */
static const float infinity = std::numeric_limits<float>::infinity(), epsilon = 1e-4f;


#if VARIATION == 5
union v_t {
	// straight union; array left unharmed; just as horrible as the others.
	struct { float _x, _y, _z; };
	float f[3];
	v_t(const float a, const float b, const float c) : _x(a), _y(b), _z(c) {}
	float x() const { return _x; }
	float &x()      { return _x; }
	float y() const { return _y; }
	float &y()      { return _y; }
	float z() const { return _z; }
	float &z()      { return _z; }
	
#else
struct v_t {
#endif
#if VARIATION == 0
	// best of the breed, but doesn't give way for an 'array access' operator.
	float _x, _y, _z;
	v_t(const float a, const float b, const float c) : _x(a), _y(b), _z(c) {}
	float x() const { return _x; }
	float &x()      { return _x; }
	float y() const { return _y; }
	float &y()      { return _y; }
	float z() const { return _z; }
	float &z()      { return _z; }
#elif VARIATION == 1
	// not as good, obvious 'array access' but forbids initializer lists
	float f[3];
	v_t(const float a, const float b, const float c) { f[0] = a; f[1] = b; f[2] = c; }
	float x() const { return f[0]; }
	float &x()      { return f[0]; }
	float y() const { return f[1]; }
	float &y()      { return f[1]; }
	float z() const { return f[2]; }
	float &z()      { return f[2]; }
#elif VARIATION == 2
	// Richard Guenther's suggestion, worst of the worst.
	union {
		struct { float x, y, z; } a;
		float b[3];
	} u;
	v_t(const float i, const float j, const float k) { u.a.x = i; u.a.y = j; u.a.z = k; }
	float x() const { return u.a.x; }
	float &x()      { return u.a.x; }
	float y() const { return u.a.y; }
	float &y()      { return u.a.y; }
	float z() const { return u.a.z; }
	float &z()      { return u.a.z; }
 
#elif VARIATION == 3
	// slightly better than variation #2, but still terrible.
	union {
		struct { float _x, _y, _z; };
		float f[3];
	};
	v_t(const float a, const float b, const float c) : _x(a), _y(b), _z(c) {}
	float x() const { return _x; }
	float &x()      { return _x; }
	float y() const { return _y; }
	float &y()      { return _y; }
	float z() const { return _z; }
	float &z()      { return _z; }
#elif VARIATION == 4
	// ditto.
	union {
		struct { float _x, _y, _z; };
		float f[3];
	};
	v_t(const float a, const float b, const float c) { f[0] = a; f[1] = b; f[2] = c; }
	float x() const { return f[0]; }
	float &x()      { return f[0]; }
	float y() const { return f[1]; }
	float &y()      { return f[1]; }
	float z() const { return f[2]; }
	float &z()      { return f[2]; }
#endif
	
	
	v_t() {}
	v_t operator+(const v_t &v) const { return v_t(x()+v.x(),y()+v.y(),z()+v.z()); }
	v_t operator-(const v_t &v) const { return v_t(x()-v.x(),y()-v.y(),z()-v.z()); }
	v_t operator-() const { return v_t(-x(),-y(),-z()); }
	v_t operator*(const float d) const { return v_t(x()*d,y()*d,z()*d);}
	v_t cross(const v_t &v) const { return v_t(y()*v.z()-z()*v.y(),z()*v.x()-x()*v.z(),x()*v.y()-y()*v.x()); }
	v_t norm() const { return*this*(1.f/sqrtf(magsqr())); }
	float dot(const v_t &v) const { return x()*v.x()+y()*v.y()+z()*v.z(); }
	float magsqr() const { return dot(*this); }
};

//static const v_t light(v_t(0.5,-.95,1.775).norm()); /*pick one*/
static const v_t light(v_t(-0.5f, -.65f , .9f).norm()); /*fiat lux*/

struct ray_t{
	v_t o,d;
	ray_t(const v_t&v):o(v){}
	ray_t(const v_t&v,const v_t&w):o(v),d(w){}
};
struct hit_t {
	v_t n;
	float t;
	hit_t():n(v_t(0,0,0)),t(infinity){}
};

struct sphere_t{ 
	v_t o;
	float r;
	sphere_t(){}
	sphere_t(const v_t &v, float d) : o(v), r(d) {}
	v_t get_normal(const v_t &v) const { return (v-o)*(1.f/r); }
	float intersect(const ray_t &ray)const{
		const v_t v(o-ray.o);
		const float b = ray.d.dot(v),disc = b*b-v.magsqr()+r*r;
		if(disc < 0.f)
			return infinity; /*branch away from the square root*/
		const float d = sqrtf(disc), t2 = b+d, t1 = b-d; /*cond. move*/
		if(t2 < 0.f)
			return infinity;
		else
			return(t1 > 0.f ? t1 : t2);
	}
};

struct node_t; 
static node_t *pool=0, *end=0;

struct node_t { /*a bvh in array form+skip for navigation.*/
	sphere_t bound,leaf;
	long diff;/*far from optimal*/
	node_t(){} node_t(const sphere_t&b,const sphere_t&l,const long jump) :bound(b),leaf(l),diff(jump){}
	template<bool shadow> static void intersect(const ray_t &ray,hit_t &hit){
		const node_t*p=pool;
		while(p < end) {
			if(p->bound.intersect(ray)>=hit.t) /*missed bound*/
				p+=p->diff; /*skip subtree*/
			else{
				const float t=p->leaf.intersect(ray);
				if(t < hit.t) { /*if hit, update, then break for shadows*/
					 hit.t=t;
					if(shadow) break;
					hit.n=p->leaf.get_normal(ray.o+ray.d*t);
				}
				++p; /*next!*/
			}
		}
	}
};

static float ray_trace(const node_t*const scene,const ray_t&ray) {
	hit_t hit;
	scene->intersect<false>(ray,hit);// trace primary
	const float diffuse = hit.t==infinity ? 0.f : -hit.n.dot(light);
	#ifdef GIMME_SHADOWS
		if (diffuse <= 0.f)
			return 0.f;
		const ray_t sray(ray.o+(ray.d*hit.t) + (hit.n*epsilon), -light);
		hit_t shit;
		scene->intersect<true>(sray,shit);// trace shadow
		return shit.t==infinity ? diffuse : 0.f;
	#else
		return diffuse > 0.f ? diffuse : 0.f;
	#endif
}

static const float grid[ss_sqr][2]={ /*our rotated grid*/
	{ -3/3.f, -1/3.f }, { +1/3.f, -3/3.f },
	{ -1/3.f, +3/3.f }, { +3/3.f, +1/3.f }
};

static void trace_rgss(const int width,const int height) {
	const float w = width, h = height,rcp =1/float(ss), scale = 256.f/float(ss_sqr);
	ray_t ray(v_t(0, 0, -4.5f)); /* eye, looking into Z */
	v_t rgss[ss_sqr];
	for(int i=0;i<ss_sqr;++i) /*precomp.*/
		rgss[i]=v_t(grid[i][0]*rcp - w/2.f, grid[i][1]*rcp - h/2.f, 0);
	
	v_t scan(0,w-1,std::max(w,h)); /*scan line*/
	for(int i=height;i;--i) {
		for(int j=width;j;--j) {
			float g=0;
			for(int idx=0;idx < ss_sqr;++idx){ /*AA*/
				ray.d=(scan+rgss[idx]).norm();
				g+=ray_trace(pool,ray); /*trace*/
			}
			int sat = int(scale*g);
			sat = sat < 0 ? 0 : sat;
			sat = sat > 255 ? 255 : sat;
			std::cout << sat << " ";
			
			scan.x() += 1; /*next pixel*/
		}
		scan.x()=0;scan.y()-=1; /*next line*/
	}
	std::cout << std::endl; 
}
	
struct basis_t{ /* bogus and compact, exactly what we need */
	v_t up,b1,b2;
	basis_t(const v_t&v){ const v_t n(v.norm());
		if ((n.x()*n.x() !=1.)&(n.y()*n.y() !=1.)&(n.z()*n.z() !=1.)) {/*cough*/
			b1=n;
			if(n.y()*n.y()>n.x()*n.x()) {
				if(n.y()*n.y()>n.z()*n.z())
					b1.y()=-b1.y();
				else b1.z()=-b1.z();
			}
			else if(n.z()*n.z() > n.x()*n.x())
				b1.z()=-b1.z();
			else b1.x()=-b1.x();
		}
		else
			b1=v_t(n.z(),n.x(),n.y());/*leaves some cases out,dodge them*/
		
		up=n;
		b2=up.cross(b1);
		b1=up.cross(b2);
	}
};

static node_t *create(node_t*n,const int lvl,int dist,v_t c,v_t d,float r) {
	n = 1 + new (n) node_t(sphere_t(c,2.f*r), sphere_t(c,r), lvl > 1 ? dist : 1);
	if (lvl <= 1)
		return n; /*if not at the bottom, recurse a bit more*/
	
	dist = std::max((dist-childs)/childs,1);
	const basis_t b(d); 
	const float nr = r*1/3.f, daL = float(2*M_PI/6.f),daU = float(2*M_PI/3.f);
	float a = 0;
	for(int i=0;i<6;++i){ /*lower ring*/
		const v_t ndir((d*-.2f + b.b1*sinf(a) + b.b2*cosf(a)).norm()); /*transcendentals?!*/
		n = create(n, lvl-1, dist, c+ndir*(r+nr), ndir, nr);
		a += daL;
	}
	a -= daL/3.f;/*tweak*/
	for(int i=0;i<3;++i){ /*upper ring*/
		const v_t ndir((d*+.6f + b.b1*sinf(a) + b.b2*cosf(a)).norm());
		n=create(n,lvl-1,dist,c+ndir*(r+nr),ndir,nr); a+=daU;
	}
	return n;
}

int main(int argc,char*argv[]){
	enum{ w = 1024, h = w }; /* resolution */
	const int lvl=(argc==2?std::max(std::atoi(argv[1]),2):6);
	int count=childs, dec=lvl;
	while(--dec > 1) count=(count*childs)+childs;
	++count;
	// std::cerr << count << " spheres,claiming " << (count*sizeof(node_t))/(1024.*1024) << " MB." << std::endl;
	pool=new node_t[count];  /* raw */
	end=pool+count;
	create(pool, lvl, count, v_t(0,0,0), v_t(+.25f, +1, -.5f).norm(), 1.f); /* cooked */
	std::cout << "P2\n" << w << " " << h << "\n256\n";
	trace_rgss(w,h); /* served */
	return 0;
}

Attachment: Makefile
Description: Binary data

Reply via email to