On Tue, Jun 21, 2011 at 8:51 PM, Kai Sterker <kai.ster...@gmail.com> wrote:
> But here's the problem: if I create some connector templates and > somebody else as well, they end up having the same id and things will > get ugly when we both try to commit and merge our connector template > data file. > > My idea would be to instead get a hash of, say, the host name where > the editor runs and XOR that with the simple counter to get values > that are locally and globally unique. Sounds about right? [snip] > I guess my main problem is that I don't really have any experience > with hashes and such. Will a short string such as the host name > provide for unique enough hashes that XORing with a counter will still > avoid collisions. How long should the hash be? 32 bits would be > convenient, and should be plenty. Or play it safe and use 64 bits? And > is the XOR such a good idea, or would it be better to have the hash as > prefix and then additional bits for the counter? > > So if anyone has any insights to share, that would be appreciated. > Otherwise I guess I have some reading to do, since the last thing I > want to do is changing the id generation algorithm after a lot of > models and stuff have been created. Well, I did some reading and experimented with stuff I'd like to share. First of, there's a standard for globally unique ids: UUIDs.[1]. But these are 128 bits in size and therefore too long as far as I am concerned. So I investigated further into the area of hashes and found that the probability of a collision depends greatly on the subset of distinct input values in contrast to the number of possible hashes [2]. So that got me thinking about how many connectors or models will we really have, and how many distinct people will be working on those. Honestly, though I'd wish for many more, I doubt that latter will be more than a hand full and former probably a value in the low thousands, more or less distributed evenly among the few contributors. That led me to the conclusion, that a 32 bit hash should be sufficient. So I picked two random implementations [3][4] and wrote a little test program that basically generates 1000 distinct strings and for each string integers in the range [0; 16384[. It than hashes the string and XORs with the integer, then checks the result for duplicates in the prior results. For comparison, instead of XORing with the hash, I used the integer as a seed for the hashing. This ought to simulate 1000 people generating ~16000 files each. Much more than expected. I then tweaked the numbers and repeated the exercise. Here a couple of the results: Hash | "People" | "Files" | Collisions (seed) | Collisions (xor) ================================================================ [3] | 1000 | 16384 | 30997 | 16384 [4] | 1000 | 16384 | 35164 | 16384 ---------------------------------------------------------------- [3] | 100 | 163840 | 31043 | 131072 [4] | 100 | 163840 | 33574 | 0 ---------------------------------------------------------------- [3] | 10 | 8192 | 2 | 0 [4] | 10 | 8192 | 0 | 0 ---------------------------------------------------------------- [3] | 10000 | 4096 | 194736 | 221184 [4] | 10000 | 4096 | 219526 | 212992 ---------------------------------------------------------------- It seems the xor generally fares better, at least when there are few contributors. In the more realistic case with very few contributors and few files, there's hardly any difference. That's certainly not exhaustive, but my feeling tells me that we should be okay with a 32 bit hash for both connectors and map objects. The trick will be to keep the running number low, which should be easy for the connectors, as they will all be stored in one file. For map objects, which are kept distributed over a tree of directories, it will be more difficult. Gotta think of something there. Anyway, I'll attach the two test programs. Feel free to experiment a bit. Compile with g++ hash32.cc -std=c++0x -o hash32 g++ hashsf.cc -std=c++0x -o hashsf Thoughts are still welcome. However, I'm finally back in the mood for programming, so don't wait too long with your input. Kai [1] http://en.wikipedia.org/wiki/Universally_unique_identifier [2] http://en.wikipedia.org/wiki/Birthday_problem#Probability_table [3] http://burtleburtle.net/bob/hash/evahash.html [4] http://www.azillionmonkeys.com/qed/hash.html
#include <unistd.h> #include <cstring> #include <cstdio> #include <cstdlib> #include <tr1/unordered_set> using std::tr1::unordered_set; typedef unsigned long int u4; /* unsigned 4-byte type */ typedef unsigned char u1; /* unsigned 1-byte type */ /* The mixing step */ #define mix(a,b,c) \ { \ a=a-b; a=a-c; a=a^(c>>13); \ b=b-c; b=b-a; b=b^(a<<8); \ c=c-a; c=c-b; c=c^(b>>13); \ a=a-b; a=a-c; a=a^(c>>12); \ b=b-c; b=b-a; b=b^(a<<16); \ c=c-a; c=c-b; c=c^(b>>5); \ a=a-b; a=a-c; a=a^(c>>3); \ b=b-c; b=b-a; b=b^(a<<10); \ c=c-a; c=c-b; c=c^(b>>15); \ } /* The whole new hash function */ u4 hash( register u1 *k, u4 length, u4 initval) { register u4 a,b,c; /* the internal state */ u4 len; /* how many key bytes still need mixing */ /* Set up the internal state */ len = length; a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */ c = initval; /* variable initialization of internal state */ /*---------------------------------------- handle most of the key */ while (len >= 12) { a=a+(k[0]+((u4)k[1]<<8)+((u4)k[2]<<16) +((u4)k[3]<<24)); b=b+(k[4]+((u4)k[5]<<8)+((u4)k[6]<<16) +((u4)k[7]<<24)); c=c+(k[8]+((u4)k[9]<<8)+((u4)k[10]<<16)+((u4)k[11]<<24)); mix(a,b,c); k = k+12; len = len-12; } /*------------------------------------- handle the last 11 bytes */ c = c+length; switch(len) /* all the case statements fall through */ { case 11: c=c+((u4)k[10]<<24); case 10: c=c+((u4)k[9]<<16); case 9 : c=c+((u4)k[8]<<8); /* the first byte of c is reserved for the length */ case 8 : b=b+((u4)k[7]<<24); case 7 : b=b+((u4)k[6]<<16); case 6 : b=b+((u4)k[5]<<8); case 5 : b=b+k[4]; case 4 : a=a+((u4)k[3]<<24); case 3 : a=a+((u4)k[2]<<16); case 2 : a=a+((u4)k[1]<<8); case 1 : a=a+k[0]; /* case 0: nothing left to add */ } mix(a,b,c); /*-------------------------------------------- report the result */ return c; } void gen_random(char *s, const int len) { static unordered_set<char*> ht1; static const char alphanum[] = "0123456789" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz"; for (int i = 0; i < len; ++i) { s[i] = alphanum[rand() % (sizeof(alphanum) - 1)]; } s[len] = 0; if (ht1.insert(strdup(s)).second == false) return gen_random(s, len); } int main (int argc, char* argv[]) { unordered_set<unsigned int> ht1, ht2; char hostname[255]; gethostname(hostname, 254); int len = strlen(hostname); for (int j = 0; j < 1000; j++) { for (int i = 0; i < 16384; i++) { unsigned int res = hash((u1*)hostname, len, i); ht1.insert(res); res = hash((u1*)hostname, len, 0); ht2.insert(res^i); } gen_random(hostname, len); } printf ("%i duplicates\n", (int)(16384*1000 - ht1.size())); printf ("%i duplicates\n", (int)(16384*1000 - ht2.size())); return 0; }
#include <unistd.h> #include <cstring> #include <cstdio> #include <cstdlib> #include <cstdint> #include <tr1/unordered_set> using std::tr1::unordered_set; #undef get16bits #if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) #define get16bits(d) (*((const uint16_t *) (d))) #endif #if !defined (get16bits) #define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\ +(uint32_t)(((const uint8_t *)(d))[0]) ) #endif uint32_t SuperFastHash (const char * data, int len, uint32_t hash) { uint32_t tmp; int rem; if (len <= 0 || data == NULL) return 0; rem = len & 3; len >>= 2; /* Main loop */ for (;len > 0; len--) { hash += get16bits (data); tmp = (get16bits (data+2) << 11) ^ hash; hash = (hash << 16) ^ tmp; data += 2*sizeof (uint16_t); hash += hash >> 11; } /* Handle end cases */ switch (rem) { case 3: hash += get16bits (data); hash ^= hash << 16; hash ^= data[sizeof (uint16_t)] << 18; hash += hash >> 11; break; case 2: hash += get16bits (data); hash ^= hash << 11; hash += hash >> 17; break; case 1: hash += *data; hash ^= hash << 10; hash += hash >> 1; } /* Force "avalanching" of final 127 bits */ hash ^= hash << 3; hash += hash >> 5; hash ^= hash << 4; hash += hash >> 17; hash ^= hash << 25; hash += hash >> 6; return hash; } void gen_random(char *s, const int len) { static unordered_set<char*> ht1; static const char alphanum[] = "0123456789" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz"; for (int i = 0; i < len; ++i) { s[i] = alphanum[rand() % (sizeof(alphanum) - 1)]; } s[len] = 0; if (ht1.insert(strdup(s)).second == false) return gen_random(s, len); } int main (int argc, char* argv[]) { unordered_set<unsigned int> ht1, ht2; char hostname[255]; gethostname(hostname, 254); int len = strlen(hostname); for (int j = 0; j < 1000; j++) { for (int i = 0; i < 16384; i++) { unsigned int res = SuperFastHash(hostname, len, i); ht1.insert(res); res = SuperFastHash(hostname, len, len); ht2.insert(res^i); } gen_random(hostname, len); } printf ("%i duplicates\n", (int)(16384*1000 - ht1.size())); printf ("%i duplicates\n", (int)(16384*1000 - ht2.size())); return 0; }
_______________________________________________ Adonthell-devel mailing list Adonthell-devel@nongnu.org https://lists.nongnu.org/mailman/listinfo/adonthell-devel