On Sun, Dec 16, 2018 at 11:50:15AM -0500, John Naylor wrote: > A few months ago I was looking into faster search algorithms for > ScanKeywordLookup(), so this is interesting to me. While an optimal > full replacement would be a lot of work, the above ideas are much less > invasive and would still have some benefit. Unless anyone intends to > work on this, I'd like to flesh out the offset-into-giant-string > approach a bit further:
Hello John, I was pointed at your patch on IRC and decided to look into adding my own pieces. What I can provide you is a fast perfect hash function generator. I've attached a sample hash function based on the current main keyword list. hash() essentially gives you the number of the only possible match, a final strcmp/memcmp is still necessary to verify that it is an actual keyword though. The |0x20 can be dropped if all cases have pre-lower-cased the input already. This would replace the binary search in the lookup functions. Returning offsets directly would be easy as well. That allows writing a single string where each entry is prefixed with a type mask, the token id, the length of the keyword and the actual keyword text. Does that sound useful to you? Joerg
#include <stddef.h> #include <inttypes.h> uint32_t hash(const void * __restrict key, size_t keylen) { static const uint16_t g[881] = { 0x015b, 0x0000, 0x0070, 0x01b2, 0x0000, 0x0078, 0x0020, 0x0000, 0x0000, 0x0193, 0x0000, 0x0000, 0x0000, 0x01ac, 0x0000, 0x0122, 0x00b9, 0x0176, 0x013b, 0x0000, 0x0000, 0x0000, 0x0150, 0x0000, 0x0000, 0x0000, 0x008b, 0x00ea, 0x00b3, 0x0197, 0x0000, 0x0118, 0x012d, 0x0102, 0x0000, 0x0091, 0x0061, 0x008c, 0x0000, 0x0000, 0x0144, 0x01b4, 0x0000, 0x0000, 0x01a8, 0x019e, 0x0000, 0x00da, 0x0000, 0x0000, 0x0122, 0x0176, 0x00f3, 0x016a, 0x00f4, 0x00c0, 0x0111, 0x0000, 0x0103, 0x0028, 0x001a, 0x0180, 0x0000, 0x0000, 0x005f, 0x0000, 0x00d9, 0x0000, 0x016d, 0x0000, 0x0170, 0x0007, 0x016f, 0x0000, 0x014e, 0x0098, 0x00a8, 0x004b, 0x0000, 0x0056, 0x0000, 0x0121, 0x0012, 0x0102, 0x0192, 0x0000, 0x00f2, 0x0066, 0x0000, 0x003a, 0x0000, 0x0000, 0x0144, 0x0000, 0x0000, 0x0133, 0x0067, 0x0169, 0x0000, 0x0000, 0x0152, 0x0122, 0x0000, 0x0058, 0x0135, 0x0045, 0x0193, 0x00d2, 0x007e, 0x0000, 0x00ae, 0x012c, 0x0000, 0x0000, 0x0000, 0x0000, 0x0124, 0x0000, 0x0046, 0x0018, 0x0000, 0x00ba, 0x00d1, 0x004a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x001f, 0x0000, 0x0101, 0x0000, 0x0000, 0x0000, 0x01b5, 0x016e, 0x0173, 0x008a, 0x0000, 0x0173, 0x000b, 0x0000, 0x00d5, 0x005e, 0x0000, 0x00ac, 0x0000, 0x0000, 0x0000, 0x01a1, 0x0000, 0x0000, 0x0127, 0x0000, 0x005e, 0x0000, 0x016f, 0x0000, 0x012b, 0x01a4, 0x01b4, 0x0000, 0x0000, 0x003a, 0x0000, 0x0000, 0x00f5, 0x00b1, 0x0003, 0x0123, 0x001b, 0x0000, 0x004f, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x007a, 0x0000, 0x0000, 0x0000, 0x0000, 0x00c2, 0x00a2, 0x00b9, 0x0000, 0x00cb, 0x0000, 0x00d2, 0x0000, 0x0197, 0x0121, 0x0000, 0x00d6, 0x0107, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0165, 0x00df, 0x0121, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x019b, 0x0000, 0x01ad, 0x0000, 0x014f, 0x018d, 0x0000, 0x015f, 0x0168, 0x0000, 0x0199, 0x0000, 0x0000, 0x0000, 0x00a1, 0x0000, 0x0000, 0x0109, 0x0000, 0x0000, 0x01a6, 0x0097, 0x0000, 0x0018, 0x0000, 0x00d1, 0x0000, 0x0000, 0x0000, 0x0187, 0x0018, 0x0000, 0x00aa, 0x0000, 0x0000, 0x0000, 0x0000, 0x0136, 0x0063, 0x00b8, 0x0000, 0x0067, 0x0114, 0x0000, 0x0000, 0x0151, 0x0000, 0x0000, 0x0000, 0x00bf, 0x0000, 0x0000, 0x0000, 0x01b4, 0x00d4, 0x0000, 0x0006, 0x017e, 0x0167, 0x003a, 0x017f, 0x0183, 0x00c9, 0x01a2, 0x0000, 0x0000, 0x0153, 0x00ce, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0051, 0x0000, 0x0086, 0x0000, 0x0083, 0x0137, 0x0000, 0x0000, 0x0050, 0x0000, 0x00d7, 0x0000, 0x0000, 0x0000, 0x0129, 0x00f1, 0x0000, 0x009b, 0x01a7, 0x0000, 0x00b4, 0x0000, 0x00e0, 0x0046, 0x0025, 0x0000, 0x0000, 0x0000, 0x0144, 0x0000, 0x01a5, 0x0044, 0x0096, 0x0078, 0x0166, 0x0000, 0x0000, 0x0000, 0x0143, 0x0000, 0x00b8, 0x0000, 0x009e, 0x0000, 0x008c, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x00fe, 0x0000, 0x0000, 0x0037, 0x0057, 0x0000, 0x00c3, 0x0000, 0x0000, 0x0000, 0x00bf, 0x014b, 0x0069, 0x00ce, 0x0000, 0x019d, 0x007f, 0x0186, 0x0000, 0x0119, 0x0015, 0x0000, 0x000e, 0x0113, 0x0139, 0x008e, 0x01ab, 0x0000, 0x005c, 0x0000, 0x0095, 0x0000, 0x019d, 0x0000, 0x0195, 0x0036, 0x0000, 0x0000, 0x00e0, 0x0146, 0x0000, 0x0033, 0x0000, 0x0000, 0x0035, 0x0000, 0x0000, 0x0000, 0x0000, 0x00d2, 0x0000, 0x0000, 0x0000, 0x0000, 0x004c, 0x00f0, 0x0000, 0x0119, 0x00bd, 0x0000, 0x0000, 0x0031, 0x0117, 0x00b4, 0x0000, 0x00f8, 0x0000, 0x0055, 0x0000, 0x0170, 0x0000, 0x0000, 0x0000, 0x00e4, 0x00b5, 0x01b5, 0x0024, 0x0000, 0x01a5, 0x0000, 0x0000, 0x0000, 0x0000, 0x0151, 0x0000, 0x00cc, 0x0000, 0x0000, 0x0150, 0x00f3, 0x0071, 0x00d0, 0x0085, 0x0140, 0x0000, 0x00ae, 0x0000, 0x00c4, 0x01a8, 0x0000, 0x0091, 0x0180, 0x0057, 0x0072, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x002a, 0x0000, 0x0112, 0x003d, 0x017f, 0x0088, 0x0000, 0x0158, 0x0046, 0x0101, 0x0000, 0x0000, 0x00ea, 0x0000, 0x0000, 0x00b2, 0x0149, 0x0000, 0x007c, 0x0107, 0x0161, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0169, 0x0000, 0x0118, 0x0091, 0x0043, 0x0064, 0x0000, 0x0000, 0x0194, 0x0000, 0x0000, 0x00e9, 0x0000, 0x0000, 0x0000, 0x005e, 0x0000, 0x0029, 0x0000, 0x0000, 0x0000, 0x003c, 0x0000, 0x0000, 0x008b, 0x0000, 0x0000, 0x00fd, 0x002d, 0x0184, 0x0000, 0x0000, 0x016a, 0x006f, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x01a0, 0x0000, 0x003b, 0x0000, 0x006d, 0x0000, 0x016a, 0x0000, 0x01b2, 0x00c9, 0x0094, 0x0181, 0x018b, 0x0000, 0x0199, 0x00c7, 0x017e, 0x0000, 0x0000, 0x0160, 0x0000, 0x0000, 0x0175, 0x0000, 0x0000, 0x006a, 0x008d, 0x0000, 0x00ed, 0x0000, 0x00b7, 0x0000, 0x0000, 0x0107, 0x00f9, 0x0000, 0x0173, 0x0137, 0x0000, 0x0185, 0x0114, 0x006c, 0x0000, 0x0000, 0x00f4, 0x0189, 0x0000, 0x0000, 0x0102, 0x0000, 0x00d5, 0x0000, 0x0000, 0x015a, 0x0000, 0x00de, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x00f0, 0x00e1, 0x0000, 0x0000, 0x01b6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0197, 0x0000, 0x0154, 0x004a, 0x018d, 0x0000, 0x00c3, 0x0000, 0x0000, 0x0000, 0x0000, 0x00c1, 0x0189, 0x001c, 0x0000, 0x00a6, 0x0000, 0x0000, 0x00a4, 0x0000, 0x0000, 0x0000, 0x00ed, 0x0000, 0x0173, 0x0169, 0x00d2, 0x0117, 0x0000, 0x009b, 0x0000, 0x014e, 0x0000, 0x00ac, 0x0000, 0x008e, 0x0121, 0x0104, 0x0179, 0x0000, 0x01a5, 0x0103, 0x0000, 0x001b, 0x0000, 0x0000, 0x01a8, 0x00ba, 0x010c, 0x0000, 0x0000, 0x010e, 0x00ab, 0x0062, 0x0000, 0x0000, 0x0154, 0x0122, 0x013f, 0x015f, 0x0000, 0x00f5, 0x01a9, 0x017b, 0x01a4, 0x0000, 0x0000, 0x0040, 0x0004, 0x019b, 0x0000, 0x00e3, 0x010d, 0x015b, 0x0000, 0x0104, 0x0000, 0x0000, 0x00b2, 0x00e8, 0x0000, 0x0000, 0x0000, 0x0065, 0x0062, 0x007a, 0x0000, 0x0065, 0x008d, 0x0000, 0x0085, 0x0000, 0x0000, 0x0000, 0x00af, 0x0104, 0x01ab, 0x0040, 0x00a8, 0x0000, 0x0000, 0x0000, 0x0159, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0189, 0x017b, 0x0077, 0x0000, 0x00ea, 0x00d7, 0x007f, 0x0000, 0x00ae, 0x0047, 0x0000, 0x0163, 0x0000, 0x0000, 0x0157, 0x0178, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x014d, 0x0009, 0x0000, 0x0000, 0x00b6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0192, 0x01b1, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0053, 0x012b, 0x00f6, 0x0096, 0x0000, 0x0141, 0x0000, 0x0000, 0x0026, 0x0044, 0x00ce, 0x0061, 0x0199, 0x0000, 0x016b, 0x0156, 0x011d, 0x0000, 0x0038, 0x008c, 0x00c8, 0x0000, 0x0000, 0x0002, 0x0000, 0x01a1, 0x0000, 0x001e, 0x0000, 0x0000, 0x00bc, 0x00ab, 0x0000, 0x0183, 0x0085, 0x0000, 0x0000, 0x010c, 0x0000, 0x01a5, 0x0120, 0x0000, 0x0000, 0x0000, 0x0000, 0x0135, 0x0079, 0x0000, 0x01ae, 0x0028, 0x0000, 0x0000, 0x014a, 0x0000, 0x00dd, 0x0000, 0x0000, 0x00d8, 0x00de, 0x0075, 0x0000, 0x0000, 0x0021, 0x0099, 0x0000, 0x00f7, 0x0000, 0x0000, 0x0000, 0x0046, 0x0000, 0x0010, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0116, 0x0000, 0x0000, 0x0000, 0x0000, 0x01a8, 0x0000, 0x0000, 0x0000, 0x004c, 0x0000, 0x00b7, 0x0000, 0x013f, 0x003c, 0x0000, 0x0000, 0x006d, 0x007f, 0x0181, 0x0000, 0x0013, 0x0000, 0x0180, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0136, 0x000d, 0x0000, 0x0000, 0x0000, 0x0082, 0x0000, 0x0000, 0x00cf, 0x00c3, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0127, 0x0000, 0x0013, 0x019c, 0x0000, 0x0024, 0x00bd, 0x017e, 0x0000, 0x00b8, 0x002e, 0x012c, 0x0007, 0x0000, 0x0000, 0x00f7, 0x0000, 0x0048, }; uint32_t h[2]; const unsigned char *k = key; uint32_t a, b; a = b = 0x0U; while (keylen--) { a = a * 31 + (k[keylen]|0x20); b = b * 37 + (k[keylen]|0x20); } h[0] = a; h[1] = b; return (g[h[0] % 881] + g[h[1] % 881]) % 440; } #ifdef TEST_DRIVER #include <stdio.h> #include <string.h> int main(void) { size_t len; char f[50]; while (fgets(f, sizeof(f), stdin) != NULL) { len = strlen(f); if (len && f[len - 1] == '\n') f[--len] = '\0'; printf("%u %s\n", (unsigned int)hash(f, len), f); } return 0; } #endif