I wrote a little C extension to test this. It contains a function that runs pglz_compress() on a bytea input N times. I ran that with different kinds of inputs, and got the following results:
unpatched:

     testname      |   auto
-------------------+-----------
 5k text           |  1425.750
 512b text         |  1287.413
 2k random         | -1053.160
 100k random       | -1056.379
 512b random       | -1018.474
 64b of text       | -2547.106
 64b random        | -3731.700
 100k of same byte |  1093.885
 100k text         |   849.430
(9 rows)

pglz-replace-pointers-with-indexes.patch (the patch I posted earlier):

     testname      |   auto
-------------------+-----------
 5k text           |  1251.576
 512b text         |   946.348
 2k random         |  -815.095
 100k random       |  -808.356
 512b random       |  -614.418
 64b of text       |  -744.382
 64b random        | -1060.758
 100k of same byte |  1192.397
 100k text         |   796.530
(9 rows)

pglz-variable-size-hash-table.patch:

     testname      |   auto
-------------------+----------
 5k text           | 1276.905
 512b text         |  823.796
 2k random         | -844.484
 100k random       | -848.080
 512b random       | -615.039
 64b of text       | -393.448
 64b random        | -568.685
 100k of same byte | 1186.759
 100k text         |  799.978
(9 rows)

These values are from a single run of the test, but I did repeat them several times to make sure there isn't so much variability in them as to render the results meaningless. The negative values mean that pglz_compress gave up on the compression in the test, i.e., they show how long it took for pglz_compress to realize that it can't compress the input. Compare the abs() values.
With the variable-size hash table, I'm not sure how significant the earlier patch to use indexes instead of pointers is. But I don't think it makes things any worse, so it's included in this.
On 01.03.2013 17:42, Heikki Linnakangas wrote:
On 01.03.2013 17:37, Alvaro Herrera wrote: My take on this is that if this patch is necessary to get Amit's patch to a committable state, it's fair game. — I don't think it's necessary for that, but let's see..
With these tweaks, I was able to make pglz-based delta encoding perform roughly as well as Amit's patch. So I think that's the approach we should take, as it's a bit simpler and more versatile. I'll follow up on that in the other thread.
- Heikki
diff --git a/src/backend/utils/adt/pg_lzcompress.c b/src/backend/utils/adt/pg_lzcompress.c index 66c64c1..b8a69ec 100644 --- a/src/backend/utils/adt/pg_lzcompress.c +++ b/src/backend/utils/adt/pg_lzcompress.c @@ -112,7 +112,7 @@ * of identical bytes like trailing spaces) and for bigger ones * our 4K maximum look-back distance is too small. * - * The compressor creates a table for 8192 lists of positions. + * The compressor creates a table for lists of positions. * For each input position (except the last 3), a hash key is * built from the 4 next input bytes and the position remembered * in the appropriate list. Thus, the table points to linked @@ -120,7 +120,10 @@ * matching strings. This is done on the fly while the input * is compressed into the output area. Table entries are only * kept for the last 4096 input positions, since we cannot use - * back-pointers larger than that anyway. + * back-pointers larger than that anyway. The size of the hash + * table depends on the size of the input - a larger table has + * a larger startup cost, as it needs to be initialized to zero, + * but reduces the number of hash collisions on long inputs. 
* * For each byte in the input, it's hash key (built from this * byte and the next 3) is used to find the appropriate list @@ -180,8 +183,7 @@ * Local definitions * ---------- */ -#define PGLZ_HISTORY_LISTS 8192 /* must be power of 2 */ -#define PGLZ_HISTORY_MASK (PGLZ_HISTORY_LISTS - 1) +#define PGLZ_MAX_HISTORY_LISTS 8192 /* must be power of 2 */ #define PGLZ_HISTORY_SIZE 4096 #define PGLZ_MAX_MATCH 273 @@ -198,9 +200,9 @@ */ typedef struct PGLZ_HistEntry { - struct PGLZ_HistEntry *next; /* links for my hash key's list */ - struct PGLZ_HistEntry *prev; - int hindex; /* my current hash key */ + int16 next; /* links for my hash key's list */ + int16 prev; + uint32 hindex; /* my current hash key */ const char *pos; /* my input position */ } PGLZ_HistEntry; @@ -241,9 +243,11 @@ const PGLZ_Strategy *const PGLZ_strategy_always = &strategy_always_data; * Statically allocated work arrays for history * ---------- */ -static PGLZ_HistEntry *hist_start[PGLZ_HISTORY_LISTS]; -static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE]; +static int16 hist_start[PGLZ_MAX_HISTORY_LISTS]; +static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE + 1]; +/* Element 0 in hist_entries is unused, and means 'invalid'. */ +#define INVALID_ENTRY 0 /* ---------- * pglz_hist_idx - @@ -257,10 +261,10 @@ static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE]; * hash keys more. * ---------- */ -#define pglz_hist_idx(_s,_e) ( \ +#define pglz_hist_idx(_s,_e, _mask) ( \ ((((_e) - (_s)) < 4) ? (int) (_s)[0] : \ - (((_s)[0] << 9) ^ ((_s)[1] << 6) ^ \ - ((_s)[2] << 3) ^ (_s)[3])) & (PGLZ_HISTORY_MASK) \ + (((_s)[0] << 6) ^ ((_s)[1] << 4) ^ \ + ((_s)[2] << 2) ^ (_s)[3])) & (_mask) \ ) @@ -276,28 +280,28 @@ static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE]; * _hn and _recycle are modified in the macro. 
* ---------- */ -#define pglz_hist_add(_hs,_he,_hn,_recycle,_s,_e) \ +#define pglz_hist_add(_hs,_he,_hn,_recycle,_s,_e, _mask) \ do { \ - int __hindex = pglz_hist_idx((_s),(_e)); \ - PGLZ_HistEntry **__myhsp = &(_hs)[__hindex]; \ + int __hindex = pglz_hist_idx((_s),(_e), (_mask)); \ + int16 *__myhsp = &(_hs)[__hindex]; \ PGLZ_HistEntry *__myhe = &(_he)[_hn]; \ if (_recycle) { \ - if (__myhe->prev == NULL) \ + if (__myhe->prev == INVALID_ENTRY) \ (_hs)[__myhe->hindex] = __myhe->next; \ else \ - __myhe->prev->next = __myhe->next; \ - if (__myhe->next != NULL) \ - __myhe->next->prev = __myhe->prev; \ + (_he)[__myhe->prev].next = __myhe->next; \ + if (__myhe->next != INVALID_ENTRY) \ + (_he)[__myhe->next].prev = __myhe->prev; \ } \ __myhe->next = *__myhsp; \ - __myhe->prev = NULL; \ + __myhe->prev = INVALID_ENTRY; \ __myhe->hindex = __hindex; \ __myhe->pos = (_s); \ - if (*__myhsp != NULL) \ - (*__myhsp)->prev = __myhe; \ - *__myhsp = __myhe; \ - if (++(_hn) >= PGLZ_HISTORY_SIZE) { \ - (_hn) = 0; \ + if (*__myhsp != INVALID_ENTRY) \ + (_he)[(*__myhsp)].prev = _hn; \ + *__myhsp = _hn; \ + if (++(_hn) >= PGLZ_HISTORY_SIZE + 1) { \ + (_hn) = 1; \ (_recycle) = true; \ } \ } while (0) @@ -372,19 +376,20 @@ do { \ * ---------- */ static inline int -pglz_find_match(PGLZ_HistEntry **hstart, const char *input, const char *end, - int *lenp, int *offp, int good_match, int good_drop) +pglz_find_match(int16 *hstart, const char *input, const char *end, + int *lenp, int *offp, int good_match, int good_drop, int mask) { - PGLZ_HistEntry *hent; + int16 hentno; int32 len = 0; int32 off = 0; /* * Traverse the linked history list until a good enough match is found. 
*/ - hent = hstart[pglz_hist_idx(input, end)]; - while (hent) + hentno = hstart[pglz_hist_idx(input, end, mask)]; + while (hentno != INVALID_ENTRY) { + PGLZ_HistEntry *hent = &hist_entries[hentno]; const char *ip = input; const char *hp = hent->pos; int32 thisoff; @@ -443,13 +448,13 @@ pglz_find_match(PGLZ_HistEntry **hstart, const char *input, const char *end, /* * Advance to the next history entry */ - hent = hent->next; + hentno = hent->next; /* * Be happy with lesser good matches the more entries we visited. But * no point in doing calculation if we're at end of list. */ - if (hent) + if (hentno != INVALID_ENTRY) { if (len >= good_match) break; @@ -484,7 +489,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, { unsigned char *bp = ((unsigned char *) dest) + sizeof(PGLZ_Header); unsigned char *bstart = bp; - int hist_next = 0; + int hist_next = 1; bool hist_recycle = false; const char *dp = source; const char *dend = source + slen; @@ -500,6 +505,8 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, int32 result_size; int32 result_max; int32 need_rate; + int hashsz; + int mask; /* * Our fallback strategy is the default. @@ -556,10 +563,28 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, result_max = (slen * (100 - need_rate)) / 100; /* + * Experiments suggest that these hash sizes work pretty well. A large + * hash table minimizes collision, but has a higher startup cost. For + * a small input, the startup cost dominates. The table size must be + * a power of two. + */ + if (slen < 128) + hashsz = 512; + else if (slen < 256) + hashsz = 1024; + else if (slen < 512) + hashsz = 2048; + else if (slen < 1024) + hashsz = 4096; + else + hashsz = 8192; + mask = hashsz - 1; + + /* * Initialize the history lists to empty. We do not need to zero the * hist_entries[] array; its entries are initialized as they are used. 
*/ - memset(hist_start, 0, sizeof(hist_start)); + memset(hist_start, 0, hashsz * sizeof(int16)); /* * Compress the source directly into the output buffer. @@ -589,7 +614,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, * Try to find a match in the history */ if (pglz_find_match(hist_start, dp, dend, &match_len, - &match_off, good_match, good_drop)) + &match_off, good_match, good_drop, mask)) { /* * Create the tag and add history entries for all matched @@ -600,7 +625,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, { pglz_hist_add(hist_start, hist_entries, hist_next, hist_recycle, - dp, dend); + dp, dend, mask); dp++; /* Do not do this ++ in the line above! */ /* The macro would do it four times - Jan. */ } @@ -614,7 +639,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp); pglz_hist_add(hist_start, hist_entries, hist_next, hist_recycle, - dp, dend); + dp, dend, mask); dp++; /* Do not do this ++ in the line above! */ /* The macro would do it four times - Jan. */ }
compresstest.tar.gz
Description: GNU Zip compressed data
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers