On Thu, Sep 11, 2014 at 11:27 AM, Andres Freund <[email protected]>
wrote:
> On 2014-09-11 10:32:24 -0300, Arthur Silva wrote:
> > Unaligned memory access received a lot of attention in Intel's post-Nehalem
> era.
> > So it may very well pay off on Intel servers. You might find this blog
> post
> > and its comments/external links interesting
> >
> http://lemire.me/blog/archives/2012/05/31/data-alignment-for-speed-myth-or-reality/
>
> FWIW, the reported results are imo pretty meaningless for postgres. It's
> sequential access over a larger amount of memory. I.e. a perfectly
> prefetchable workload where it doesn't matter if superfluous cachelines
> are fetched because they're going to be needed next round anyway.
>
> In many production workloads one of the busiest accesses to individual
> datums is the binary search on individual pages during index
> lookups. That's pretty much exactly the opposite of the above.
>
> Not saying that it's not going to be a benefit in many scenarios, but
> it's far from being as simple as saying that unaligned accesses on their
> own aren't penalized anymore.
>
> Greetings,
>
> Andres Freund
>
> --
> Andres Freund http://www.2ndQuadrant.com/
> PostgreSQL Development, 24x7 Support, Training & Services
>
I modified the test code to use a completely random access pattern, to test
something that completely thrashes the cache. It's not realistic, but it still
supports the hypothesis that the overhead is minimal on modern Intel CPUs.
------------------ test results compiling for 32bit ------------------
processing word of size 2
offset = 0
average time for offset 0 is 422.7
offset = 1
average time for offset 1 is 422.85
processing word of size 4
offset = 0
average time for offset 0 is 436.6
offset = 1
average time for offset 1 is 451
offset = 2
average time for offset 2 is 444.3
offset = 3
average time for offset 3 is 441.9
processing word of size 8
offset = 0
average time for offset 0 is 630.15
offset = 1
average time for offset 1 is 653
offset = 2
average time for offset 2 is 655.5
offset = 3
average time for offset 3 is 660.85
offset = 4
average time for offset 4 is 650.1
offset = 5
average time for offset 5 is 656.9
offset = 6
average time for offset 6 is 656.6
offset = 7
average time for offset 7 is 656.9
------------------ test results compiling for 64bit ------------------
processing word of size 2
offset = 0
average time for offset 0 is 402.55
offset = 1
average time for offset 1 is 406.9
processing word of size 4
offset = 0
average time for offset 0 is 424.05
offset = 1
average time for offset 1 is 436.55
offset = 2
average time for offset 2 is 435.1
offset = 3
average time for offset 3 is 435.3
processing word of size 8
offset = 0
average time for offset 0 is 444.9
offset = 1
average time for offset 1 is 470.25
offset = 2
average time for offset 2 is 468.95
offset = 3
average time for offset 3 is 476.75
offset = 4
average time for offset 4 is 474.9
offset = 5
average time for offset 5 is 468.25
offset = 6
average time for offset 6 is 469.8
offset = 7
average time for offset 7 is 469.1
// g++ -O2 -o test test.cpp && ./test
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <iostream>
#include <cassert>
#include <vector>
#include "inttypes.h"
using namespace std;
// Millisecond-resolution wall-clock stopwatch built on gettimeofday().
//
// t1 is the start mark (set by the constructor and by reset()); t2 is the
// most recent split mark (set by split(), and reset to t1 by reset()).
class WallClockTimer
{
public:
    struct timeval t1, t2;

    // Starts the stopwatch at the current time.
    WallClockTimer() :
        t1(), t2()
    {
        reset();
    }

    // Restarts the stopwatch: both marks are set to the current time.
    void reset()
    {
        gettimeofday(&t1, 0);
        t2 = t1;
    }

    // Returns the milliseconds between the start mark and the last split mark.
    //
    // The seconds are subtracted BEFORE being scaled by 1000. Scaling an
    // absolute tv_sec first overflows a 32-bit time_t for any present-day
    // epoch value, which is undefined behaviour for a signed type. Each
    // tv_usec is truncated to whole milliseconds before the subtraction, so
    // the result equals (sec2*1000 + usec2/1000) - (sec1*1000 + usec1/1000)
    // whenever that expression does not overflow.
    int elapsed() const
    {
        const long seconds = static_cast<long>(t2.tv_sec - t1.tv_sec);
        const long millis = static_cast<long>(t2.tv_usec / 1000 - t1.tv_usec / 1000);
        return static_cast<int>(seconds * 1000 + millis);
    }

    // Records the current time as the split mark and returns elapsed().
    int split()
    {
        gettimeofday(&t2, 0);
        return elapsed();
    }
};
// xorshift pseudo-random generator with four 32-bit words of state.
//
// The state lives in function-local statics with fixed seeds, so the output
// sequence is the same on every program run. Each call rotates the state
// words down by one position, computes a new last word from the old first
// and old last words, and returns that new word.
uint32_t xor128(void)
{
    static uint32_t s0 = 123456789;
    static uint32_t s1 = 362436069;
    static uint32_t s2 = 521288629;
    static uint32_t s3 = 88675123;

    // Scramble the oldest word before it is dropped from the state.
    const uint32_t scrambled = s0 ^ (s0 << 11);

    s0 = s1;
    s1 = s2;
    s2 = s3;

    s3 = s3 ^ (s3 >> 19) ^ (scrambled ^ (scrambled >> 8));
    return s3;
}
// Times random writes and reads of T at every byte offset in [0, sizeof(T)).
//
// For each offset a fresh vector of N + 2 elements is allocated and a T* is
// formed `offset` bytes past the address of its first element; all accesses
// in the timed loops go through that shifted pointer. Each of the `repeat`
// timed rounds performs N writes followed by N reads at indices drawn from
// xor128(). The mean time per round, in the milliseconds reported by
// WallClockTimer, is printed for each offset.
template <class T>
void runtest()
{
    const size_t N = 10 * 1000 * 1000;
    const int repeat = 20;
    WallClockTimer timer;
    const bool paranoid = false;

    std::cout << " processing word of size " << sizeof(T) << std::endl;

    for (unsigned int offset = 0; offset < sizeof(T); ++offset)
    {
        // Two spare elements leave room for the shifted range to stay inside
        // the allocation.
        std::vector<T> storage(N + 2);
        std::cout << "offset = " << offset << std::endl;

        const uintptr_t base = reinterpret_cast<uintptr_t>(&storage[0]);
        T* const first = reinterpret_cast<T*>(base + offset);
        assert(offset + base == reinterpret_cast<uintptr_t>(first));

        T* const last = first + N;
        if (paranoid)
        {
            assert(reinterpret_cast<uintptr_t>(last) < reinterpret_cast<uintptr_t>(&storage.back()));
        }

        int total_ms = 0;
        for (int round = 0; round < repeat; ++round)
        {
            timer.reset();

            // Write phase: N stores at pseudo-random indices.
            for (size_t i = 0; i < N; ++i)
            {
                const int slot = xor128() % N;
                first[slot] = static_cast<T>(i);
            }

            // Read phase: the volatile accumulator keeps the loads from
            // being optimised away.
            volatile T val = 1;
            for (size_t i = 0; i < N; ++i)
            {
                const int slot = xor128() % N;
                val += first[slot] * val + 33;
            }

            const int round_ms = timer.split();
            total_ms += round_ms;
        }

        std::cout << " average time for offset " << (offset % sizeof(T)) << " is " << total_ms * 1.0 / repeat << std::endl;
    }
}
// Runs the benchmark for 2-, 4- and 8-byte unsigned words in turn, printing
// a blank line after each run.
int main()
{
    runtest<uint16_t>();
    std::cout << std::endl;

    runtest<uint32_t>();
    std::cout << std::endl;

    runtest<uint64_t>();
    std::cout << std::endl;

    return 0;
}
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers