On 7/19/07, Richard Guenther <[EMAIL PROTECTED]> wrote:
Of course, if any then the array indexing variant is fixed. It would be nice to see a complete testcase with a pessimization, maybe you can file a bugreport about this?
There's many issues for all alternatives and i'm not qualified to pinpoint them further. I've taken http://ompf.org/ray/sphereflake/ which is used as a benchmark already here http://www.suse.de/~gcctest/c++bench/raytracer/, because it's small, self contained and has such a basic 3 component class that's used all over. It doesn't use any kind of array access operator, but it's good enough to show the price one has to pay before even thinking of providing some. It has been adjusted to use floats and access members through accessors (to allow for a straighter comparison of all cases).
variation 0 is the reference, a mere struct { float x,y,z; ...};, performs as good as the original, but wouldn't allow for any 'valid' indexing. variation 1 is struct { float f[3]; ... } variations 2,3,4,5 try to use some union # /usr/local/gcc-4.3-20070720/bin/g++ -v Using built-in specs. Target: x86_64-unknown-linux-gnu Configured with: ../configure --prefix=/usr/local/gcc-4.3-20070720 --enable-languages=c,c++ --enable-threads=posix --disable-checking --disable-nls --disable-shared --disable-win32-registry --with-system-zlib --disable-multilib --verbose --with-gcc=gcc-4.2 --with-gnu-ld --with-gnu-as --enable-checking=none --disable-bootstrap Thread model: posix gcc version 4.3.0 20070720 (experimental) # make bench [snip] sf.v0 real 0m3.963s user 0m3.812s sys 0m0.152s sf.v1 real 0m3.972s user 0m3.864s sys 0m0.104s sf.v2 real 0m10.384s user 0m10.261s sys 0m0.120s sf.v3 real 0m10.390s user 0m10.289s sys 0m0.104s sf.v4 real 0m10.388s user 0m10.265s sys 0m0.124s sf.v5 real 0m10.399s user 0m10.281s sys 0m0.116s There's some inlining difference between union variations and the first two, but they clearly stand in their own league anyway. So we can only seriously consider the first two. Variation #0 would ask for invalid c++ (pointer arithmetic abuse, not an option anymore) or forbidding array access operator and going to set/get + memcpy, but pretty optimal. Variation #1 (straight array) is quite annoying in C++ (no initializer list, need to reformulate all access etc...) and already show some slight pessimization, but it's not easy to track. Apparently g++ got a bit better lately in this regard, or it's only blatant on larger data or more complex cases. I hope this shows how problematic it is for the end user.
// sphere flake bvh raytracer (c) 2005, thierry berger-perrin <[EMAIL PROTECTED]> // this code is released under the GNU Public License. // see http://ompf.org/ray/sphereflake/ // compile with ie g++ -O2 -ffast-math sphereflake.cc // usage: ./sphereflake [lvl=6] >pix.ppm #include <cmath> #include <iostream> #include <cstdlib> #include <limits> #define GIMME_SHADOWS enum { childs = 9, ss= 2, ss_sqr = ss*ss }; /* not really tweakable anymore */ static const float infinity = std::numeric_limits<float>::infinity(), epsilon = 1e-4f; #if VARIATION == 5 union v_t { // straight union; array left unharmed; just as horrible as the others. struct { float _x, _y, _z; }; float f[3]; v_t(const float a, const float b, const float c) : _x(a), _y(b), _z(c) {} float x() const { return _x; } float &x() { return _x; } float y() const { return _y; } float &y() { return _y; } float z() const { return _z; } float &z() { return _z; } #else struct v_t { #endif #if VARIATION == 0 // best of the breed, but doesn't give way for an 'array access' operator. float _x, _y, _z; v_t(const float a, const float b, const float c) : _x(a), _y(b), _z(c) {} float x() const { return _x; } float &x() { return _x; } float y() const { return _y; } float &y() { return _y; } float z() const { return _z; } float &z() { return _z; } #elif VARIATION == 1 // not as good, obvious 'array access' but forbids initializer lists float f[3]; v_t(const float a, const float b, const float c) { f[0] = a; f[1] = b; f[2] = c; } float x() const { return f[0]; } float &x() { return f[0]; } float y() const { return f[1]; } float &y() { return f[1]; } float z() const { return f[2]; } float &z() { return f[2]; } #elif VARIATION == 2 // Richard Guenther's suggestion, worst of the worst. union { struct { float x, y, z; } a; float b[3]; } u; v_t(const float i, const float j, const float k) { u.a.x = i; u.a.y = j; u.a.z = k; } float x() const { return u.a.x; } float &x() { return u.a.x; } float y() const { return u.a.y; } float &y() { return u.a.y; } float z() const { return u.a.z; } float &z() { return u.a.z; } #elif VARIATION == 3 // slightly better than variation #2, but still terrible. union { struct { float _x, _y, _z; }; float f[3]; }; v_t(const float a, const float b, const float c) : _x(a), _y(b), _z(c) {} float x() const { return _x; } float &x() { return _x; } float y() const { return _y; } float &y() { return _y; } float z() const { return _z; } float &z() { return _z; } #elif VARIATION == 4 // ditto. union { struct { float _x, _y, _z; }; float f[3]; }; v_t(const float a, const float b, const float c) { f[0] = a; f[1] = b; f[2] = c; } float x() const { return f[0]; } float &x() { return f[0]; } float y() const { return f[1]; } float &y() { return f[1]; } float z() const { return f[2]; } float &z() { return f[2]; } #endif v_t() {} v_t operator+(const v_t &v) const { return v_t(x()+v.x(),y()+v.y(),z()+v.z()); } v_t operator-(const v_t &v) const { return v_t(x()-v.x(),y()-v.y(),z()-v.z()); } v_t operator-() const { return v_t(-x(),-y(),-z()); } v_t operator*(const float d) const { return v_t(x()*d,y()*d,z()*d);} v_t cross(const v_t &v) const { return v_t(y()*v.z()-z()*v.y(),z()*v.x()-x()*v.z(),x()*v.y()-y()*v.x()); } v_t norm() const { return*this*(1.f/sqrtf(magsqr())); } float dot(const v_t &v) const { return x()*v.x()+y()*v.y()+z()*v.z(); } float magsqr() const { return dot(*this); } }; //static const v_t light(v_t(0.5,-.95,1.775).norm()); /*pick one*/ static const v_t light(v_t(-0.5f, -.65f , .9f).norm()); /*fiat lux*/ struct ray_t{ v_t o,d; ray_t(const v_t&v):o(v){} ray_t(const v_t&v,const v_t&w):o(v),d(w){} }; struct hit_t { v_t n; float t; hit_t():n(v_t(0,0,0)),t(infinity){} }; struct sphere_t{ v_t o; float r; sphere_t(){} sphere_t(const v_t &v, float d) : o(v), r(d) {} v_t get_normal(const v_t &v) const { return (v-o)*(1.f/r); } float intersect(const ray_t &ray)const{ const v_t v(o-ray.o); const float b = ray.d.dot(v),disc = b*b-v.magsqr()+r*r; if(disc < 0.f) return infinity; /*branch away from the square root*/ const float d = sqrtf(disc), t2 = b+d, t1 = b-d; /*cond. move*/ if(t2 < 0.f) return infinity; else return(t1 > 0.f ? t1 : t2); } }; struct node_t; static node_t *pool=0, *end=0; struct node_t { /*a bvh in array form+skip for navigation.*/ sphere_t bound,leaf; long diff;/*far from optimal*/ node_t(){} node_t(const sphere_t&b,const sphere_t&l,const long jump) :bound(b),leaf(l),diff(jump){} template<bool shadow> static void intersect(const ray_t &ray,hit_t &hit){ const node_t*p=pool; while(p < end) { if(p->bound.intersect(ray)>=hit.t) /*missed bound*/ p+=p->diff; /*skip subtree*/ else{ const float t=p->leaf.intersect(ray); if(t < hit.t) { /*if hit, update, then break for shadows*/ hit.t=t; if(shadow) break; hit.n=p->leaf.get_normal(ray.o+ray.d*t); } ++p; /*next!*/ } } } }; static float ray_trace(const node_t*const scene,const ray_t&ray) { hit_t hit; scene->intersect<false>(ray,hit);// trace primary const float diffuse = hit.t==infinity ? 0.f : -hit.n.dot(light); #ifdef GIMME_SHADOWS if (diffuse <= 0.f) return 0.f; const ray_t sray(ray.o+(ray.d*hit.t) + (hit.n*epsilon), -light); hit_t shit; scene->intersect<true>(sray,shit);// trace shadow return shit.t==infinity ? diffuse : 0.f; #else return diffuse > 0.f ? diffuse : 0.f; #endif } static const float grid[ss_sqr][2]={ /*our rotated grid*/ { -3/3.f, -1/3.f }, { +1/3.f, -3/3.f }, { -1/3.f, +3/3.f }, { +3/3.f, +1/3.f } }; static void trace_rgss(const int width,const int height) { const float w = width, h = height,rcp =1/float(ss), scale = 256.f/float(ss_sqr); ray_t ray(v_t(0, 0, -4.5f)); /* eye, looking into Z */ v_t rgss[ss_sqr]; for(int i=0;i<ss_sqr;++i) /*precomp.*/ rgss[i]=v_t(grid[i][0]*rcp - w/2.f, grid[i][1]*rcp - h/2.f, 0); v_t scan(0,w-1,std::max(w,h)); /*scan line*/ for(int i=height;i;--i) { for(int j=width;j;--j) { float g=0; for(int idx=0;idx < ss_sqr;++idx){ /*AA*/ ray.d=(scan+rgss[idx]).norm(); g+=ray_trace(pool,ray); /*trace*/ } int sat = int(scale*g); sat = sat < 0 ? 0 : sat; sat = sat > 255 ? 255 : sat; std::cout << sat << " "; scan.x() += 1; /*next pixel*/ } scan.x()=0;scan.y()-=1; /*next line*/ } std::cout << std::endl; } struct basis_t{ /* bogus and compact, exactly what we need */ v_t up,b1,b2; basis_t(const v_t&v){ const v_t n(v.norm()); if ((n.x()*n.x() !=1.)&(n.y()*n.y() !=1.)&(n.z()*n.z() !=1.)) {/*cough*/ b1=n; if(n.y()*n.y()>n.x()*n.x()) { if(n.y()*n.y()>n.z()*n.z()) b1.y()=-b1.y(); else b1.z()=-b1.z(); } else if(n.z()*n.z() > n.x()*n.x()) b1.z()=-b1.z(); else b1.x()=-b1.x(); } else b1=v_t(n.z(),n.x(),n.y());/*leaves some cases out,dodge them*/ up=n; b2=up.cross(b1); b1=up.cross(b2); } }; static node_t *create(node_t*n,const int lvl,int dist,v_t c,v_t d,float r) { n = 1 + new (n) node_t(sphere_t(c,2.f*r), sphere_t(c,r), lvl > 1 ? dist : 1); if (lvl <= 1) return n; /*if not at the bottom, recurse a bit more*/ dist = std::max((dist-childs)/childs,1); const basis_t b(d); const float nr = r*1/3.f, daL = float(2*M_PI/6.f),daU = float(2*M_PI/3.f); float a = 0; for(int i=0;i<6;++i){ /*lower ring*/ const v_t ndir((d*-.2f + b.b1*sinf(a) + b.b2*cosf(a)).norm()); /*transcendentals?!*/ n = create(n, lvl-1, dist, c+ndir*(r+nr), ndir, nr); a += daL; } a -= daL/3.f;/*tweak*/ for(int i=0;i<3;++i){ /*upper ring*/ const v_t ndir((d*+.6f + b.b1*sinf(a) + b.b2*cosf(a)).norm()); n=create(n,lvl-1,dist,c+ndir*(r+nr),ndir,nr); a+=daU; } return n; } int main(int argc,char*argv[]){ enum{ w = 1024, h = w }; /* resolution */ const int lvl=(argc==2?std::max(std::atoi(argv[1]),2):6); int count=childs, dec=lvl; while(--dec > 1) count=(count*childs)+childs; ++count; // std::cerr << count << " spheres,claiming " << (count*sizeof(node_t))/(1024.*1024) << " MB." << std::endl; pool=new node_t[count]; /* raw */ end=pool+count; create(pool, lvl, count, v_t(0,0,0), v_t(+.25f, +1, -.5f).norm(), 1.f); /* cooked */ std::cout << "P2\n" << w << " " << h << "\n256\n"; trace_rgss(w,h); /* served */ return 0; }
Makefile
Description: Binary data