On Sun, Dec 16, 2018 at 11:50:15AM -0500, John Naylor wrote:
> A few months ago I was looking into faster search algorithms for
> ScanKeywordLookup(), so this is interesting to me. While an optimal
> full replacement would be a lot of work, the above ideas are much less
> invasive and would still have some benefit. Unless anyone intends to
> work on this, I'd like to flesh out the offset-into-giant-string
> approach a bit further:

Hello John,
I was pointed at your patch on IRC and decided to look into adding my
own pieces. What I can provide you is a fast perfect hash function
generator.  I've attached a sample hash function based on the current
main keyword list. hash() essentially gives you the number of the only
possible match, a final strcmp/memcmp is still necessary to verify that
it is an actual keyword though. The |0x20 can be dropped if all cases
have pre-lower-cased the input already. This would replace the binary
search in the lookup functions. Returning offsets directly would be easy
as well. That allows writing a single string where each entry is prefixed
with a type mask, the token id, the length of the keyword and the actual
keyword text. Does that sound useful to you?

Joerg
#include <stddef.h>
#include <inttypes.h>

uint32_t
hash(const void * __restrict key, size_t keylen)
{
        static const uint16_t g[881] = {
            0x015b, 0x0000, 0x0070, 0x01b2, 0x0000, 0x0078, 0x0020, 0x0000,
            0x0000, 0x0193, 0x0000, 0x0000, 0x0000, 0x01ac, 0x0000, 0x0122,
            0x00b9, 0x0176, 0x013b, 0x0000, 0x0000, 0x0000, 0x0150, 0x0000,
            0x0000, 0x0000, 0x008b, 0x00ea, 0x00b3, 0x0197, 0x0000, 0x0118,
            0x012d, 0x0102, 0x0000, 0x0091, 0x0061, 0x008c, 0x0000, 0x0000,
            0x0144, 0x01b4, 0x0000, 0x0000, 0x01a8, 0x019e, 0x0000, 0x00da,
            0x0000, 0x0000, 0x0122, 0x0176, 0x00f3, 0x016a, 0x00f4, 0x00c0,
            0x0111, 0x0000, 0x0103, 0x0028, 0x001a, 0x0180, 0x0000, 0x0000,
            0x005f, 0x0000, 0x00d9, 0x0000, 0x016d, 0x0000, 0x0170, 0x0007,
            0x016f, 0x0000, 0x014e, 0x0098, 0x00a8, 0x004b, 0x0000, 0x0056,
            0x0000, 0x0121, 0x0012, 0x0102, 0x0192, 0x0000, 0x00f2, 0x0066,
            0x0000, 0x003a, 0x0000, 0x0000, 0x0144, 0x0000, 0x0000, 0x0133,
            0x0067, 0x0169, 0x0000, 0x0000, 0x0152, 0x0122, 0x0000, 0x0058,
            0x0135, 0x0045, 0x0193, 0x00d2, 0x007e, 0x0000, 0x00ae, 0x012c,
            0x0000, 0x0000, 0x0000, 0x0000, 0x0124, 0x0000, 0x0046, 0x0018,
            0x0000, 0x00ba, 0x00d1, 0x004a, 0x0000, 0x0000, 0x0000, 0x0000,
            0x0000, 0x001f, 0x0000, 0x0101, 0x0000, 0x0000, 0x0000, 0x01b5,
            0x016e, 0x0173, 0x008a, 0x0000, 0x0173, 0x000b, 0x0000, 0x00d5,
            0x005e, 0x0000, 0x00ac, 0x0000, 0x0000, 0x0000, 0x01a1, 0x0000,
            0x0000, 0x0127, 0x0000, 0x005e, 0x0000, 0x016f, 0x0000, 0x012b,
            0x01a4, 0x01b4, 0x0000, 0x0000, 0x003a, 0x0000, 0x0000, 0x00f5,
            0x00b1, 0x0003, 0x0123, 0x001b, 0x0000, 0x004f, 0x0000, 0x0000,
            0x0000, 0x0000, 0x0000, 0x007a, 0x0000, 0x0000, 0x0000, 0x0000,
            0x00c2, 0x00a2, 0x00b9, 0x0000, 0x00cb, 0x0000, 0x00d2, 0x0000,
            0x0197, 0x0121, 0x0000, 0x00d6, 0x0107, 0x0000, 0x0000, 0x0000,
            0x0000, 0x0000, 0x0000, 0x0165, 0x00df, 0x0121, 0x0000, 0x0000,
            0x0000, 0x0000, 0x0000, 0x019b, 0x0000, 0x01ad, 0x0000, 0x014f,
            0x018d, 0x0000, 0x015f, 0x0168, 0x0000, 0x0199, 0x0000, 0x0000,
            0x0000, 0x00a1, 0x0000, 0x0000, 0x0109, 0x0000, 0x0000, 0x01a6,
            0x0097, 0x0000, 0x0018, 0x0000, 0x00d1, 0x0000, 0x0000, 0x0000,
            0x0187, 0x0018, 0x0000, 0x00aa, 0x0000, 0x0000, 0x0000, 0x0000,
            0x0136, 0x0063, 0x00b8, 0x0000, 0x0067, 0x0114, 0x0000, 0x0000,
            0x0151, 0x0000, 0x0000, 0x0000, 0x00bf, 0x0000, 0x0000, 0x0000,
            0x01b4, 0x00d4, 0x0000, 0x0006, 0x017e, 0x0167, 0x003a, 0x017f,
            0x0183, 0x00c9, 0x01a2, 0x0000, 0x0000, 0x0153, 0x00ce, 0x0000,
            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0051, 0x0000, 0x0086,
            0x0000, 0x0083, 0x0137, 0x0000, 0x0000, 0x0050, 0x0000, 0x00d7,
            0x0000, 0x0000, 0x0000, 0x0129, 0x00f1, 0x0000, 0x009b, 0x01a7,
            0x0000, 0x00b4, 0x0000, 0x00e0, 0x0046, 0x0025, 0x0000, 0x0000,
            0x0000, 0x0144, 0x0000, 0x01a5, 0x0044, 0x0096, 0x0078, 0x0166,
            0x0000, 0x0000, 0x0000, 0x0143, 0x0000, 0x00b8, 0x0000, 0x009e,
            0x0000, 0x008c, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x00fe,
            0x0000, 0x0000, 0x0037, 0x0057, 0x0000, 0x00c3, 0x0000, 0x0000,
            0x0000, 0x00bf, 0x014b, 0x0069, 0x00ce, 0x0000, 0x019d, 0x007f,
            0x0186, 0x0000, 0x0119, 0x0015, 0x0000, 0x000e, 0x0113, 0x0139,
            0x008e, 0x01ab, 0x0000, 0x005c, 0x0000, 0x0095, 0x0000, 0x019d,
            0x0000, 0x0195, 0x0036, 0x0000, 0x0000, 0x00e0, 0x0146, 0x0000,
            0x0033, 0x0000, 0x0000, 0x0035, 0x0000, 0x0000, 0x0000, 0x0000,
            0x00d2, 0x0000, 0x0000, 0x0000, 0x0000, 0x004c, 0x00f0, 0x0000,
            0x0119, 0x00bd, 0x0000, 0x0000, 0x0031, 0x0117, 0x00b4, 0x0000,
            0x00f8, 0x0000, 0x0055, 0x0000, 0x0170, 0x0000, 0x0000, 0x0000,
            0x00e4, 0x00b5, 0x01b5, 0x0024, 0x0000, 0x01a5, 0x0000, 0x0000,
            0x0000, 0x0000, 0x0151, 0x0000, 0x00cc, 0x0000, 0x0000, 0x0150,
            0x00f3, 0x0071, 0x00d0, 0x0085, 0x0140, 0x0000, 0x00ae, 0x0000,
            0x00c4, 0x01a8, 0x0000, 0x0091, 0x0180, 0x0057, 0x0072, 0x0000,
            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x002a, 0x0000,
            0x0112, 0x003d, 0x017f, 0x0088, 0x0000, 0x0158, 0x0046, 0x0101,
            0x0000, 0x0000, 0x00ea, 0x0000, 0x0000, 0x00b2, 0x0149, 0x0000,
            0x007c, 0x0107, 0x0161, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
            0x0000, 0x0169, 0x0000, 0x0118, 0x0091, 0x0043, 0x0064, 0x0000,
            0x0000, 0x0194, 0x0000, 0x0000, 0x00e9, 0x0000, 0x0000, 0x0000,
            0x005e, 0x0000, 0x0029, 0x0000, 0x0000, 0x0000, 0x003c, 0x0000,
            0x0000, 0x008b, 0x0000, 0x0000, 0x00fd, 0x002d, 0x0184, 0x0000,
            0x0000, 0x016a, 0x006f, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
            0x0000, 0x0000, 0x0000, 0x0000, 0x01a0, 0x0000, 0x003b, 0x0000,
            0x006d, 0x0000, 0x016a, 0x0000, 0x01b2, 0x00c9, 0x0094, 0x0181,
            0x018b, 0x0000, 0x0199, 0x00c7, 0x017e, 0x0000, 0x0000, 0x0160,
            0x0000, 0x0000, 0x0175, 0x0000, 0x0000, 0x006a, 0x008d, 0x0000,
            0x00ed, 0x0000, 0x00b7, 0x0000, 0x0000, 0x0107, 0x00f9, 0x0000,
            0x0173, 0x0137, 0x0000, 0x0185, 0x0114, 0x006c, 0x0000, 0x0000,
            0x00f4, 0x0189, 0x0000, 0x0000, 0x0102, 0x0000, 0x00d5, 0x0000,
            0x0000, 0x015a, 0x0000, 0x00de, 0x0000, 0x0000, 0x0000, 0x0000,
            0x0000, 0x0000, 0x0000, 0x00f0, 0x00e1, 0x0000, 0x0000, 0x01b6,
            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0197, 0x0000, 0x0154,
            0x004a, 0x018d, 0x0000, 0x00c3, 0x0000, 0x0000, 0x0000, 0x0000,
            0x00c1, 0x0189, 0x001c, 0x0000, 0x00a6, 0x0000, 0x0000, 0x00a4,
            0x0000, 0x0000, 0x0000, 0x00ed, 0x0000, 0x0173, 0x0169, 0x00d2,
            0x0117, 0x0000, 0x009b, 0x0000, 0x014e, 0x0000, 0x00ac, 0x0000,
            0x008e, 0x0121, 0x0104, 0x0179, 0x0000, 0x01a5, 0x0103, 0x0000,
            0x001b, 0x0000, 0x0000, 0x01a8, 0x00ba, 0x010c, 0x0000, 0x0000,
            0x010e, 0x00ab, 0x0062, 0x0000, 0x0000, 0x0154, 0x0122, 0x013f,
            0x015f, 0x0000, 0x00f5, 0x01a9, 0x017b, 0x01a4, 0x0000, 0x0000,
            0x0040, 0x0004, 0x019b, 0x0000, 0x00e3, 0x010d, 0x015b, 0x0000,
            0x0104, 0x0000, 0x0000, 0x00b2, 0x00e8, 0x0000, 0x0000, 0x0000,
            0x0065, 0x0062, 0x007a, 0x0000, 0x0065, 0x008d, 0x0000, 0x0085,
            0x0000, 0x0000, 0x0000, 0x00af, 0x0104, 0x01ab, 0x0040, 0x00a8,
            0x0000, 0x0000, 0x0000, 0x0159, 0x0000, 0x0000, 0x0000, 0x0000,
            0x0000, 0x0189, 0x017b, 0x0077, 0x0000, 0x00ea, 0x00d7, 0x007f,
            0x0000, 0x00ae, 0x0047, 0x0000, 0x0163, 0x0000, 0x0000, 0x0157,
            0x0178, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x014d, 0x0009,
            0x0000, 0x0000, 0x00b6, 0x0000, 0x0000, 0x0000, 0x0000, 0x0192,
            0x01b1, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
            0x0053, 0x012b, 0x00f6, 0x0096, 0x0000, 0x0141, 0x0000, 0x0000,
            0x0026, 0x0044, 0x00ce, 0x0061, 0x0199, 0x0000, 0x016b, 0x0156,
            0x011d, 0x0000, 0x0038, 0x008c, 0x00c8, 0x0000, 0x0000, 0x0002,
            0x0000, 0x01a1, 0x0000, 0x001e, 0x0000, 0x0000, 0x00bc, 0x00ab,
            0x0000, 0x0183, 0x0085, 0x0000, 0x0000, 0x010c, 0x0000, 0x01a5,
            0x0120, 0x0000, 0x0000, 0x0000, 0x0000, 0x0135, 0x0079, 0x0000,
            0x01ae, 0x0028, 0x0000, 0x0000, 0x014a, 0x0000, 0x00dd, 0x0000,
            0x0000, 0x00d8, 0x00de, 0x0075, 0x0000, 0x0000, 0x0021, 0x0099,
            0x0000, 0x00f7, 0x0000, 0x0000, 0x0000, 0x0046, 0x0000, 0x0010,
            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0116,
            0x0000, 0x0000, 0x0000, 0x0000, 0x01a8, 0x0000, 0x0000, 0x0000,
            0x004c, 0x0000, 0x00b7, 0x0000, 0x013f, 0x003c, 0x0000, 0x0000,
            0x006d, 0x007f, 0x0181, 0x0000, 0x0013, 0x0000, 0x0180, 0x0000,
            0x0000, 0x0000, 0x0000, 0x0000, 0x0136, 0x000d, 0x0000, 0x0000,
            0x0000, 0x0082, 0x0000, 0x0000, 0x00cf, 0x00c3, 0x0000, 0x0000,
            0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0127,
            0x0000, 0x0013, 0x019c, 0x0000, 0x0024, 0x00bd, 0x017e, 0x0000,
            0x00b8, 0x002e, 0x012c, 0x0007, 0x0000, 0x0000, 0x00f7, 0x0000,
            0x0048,
        };
        uint32_t h[2];

        const unsigned char *k = key;
        uint32_t a, b;
        a = b = 0x0U;
        while (keylen--) {
                a = a * 31 + (k[keylen]|0x20);
                b = b * 37 + (k[keylen]|0x20);
        }
        h[0] = a;
        h[1] = b;
        return (g[h[0] % 881] + g[h[1] % 881]) % 440;
}

#ifdef TEST_DRIVER
#include <stdio.h>
#include <string.h>

int main(void)
{
        size_t len;
        char f[50];
        while (fgets(f, sizeof(f), stdin) != NULL) {
                len = strlen(f);
                if (len && f[len - 1] == '\n')
                        f[--len] = '\0';
                printf("%u %s\n", (unsigned int)hash(f, len), f);
        }
        return 0;
}
#endif

Reply via email to