I wrote a little C extension to test this. It contains a function, which runs pglz_compress() on a bytea input, N times. I ran that with different kinds of inputs, and got the following results:
unpatched:
testname | auto
-------------------+-----------
5k text | 1425.750
512b text | 1287.413
2k random | -1053.160
100k random | -1056.379
512b random | -1018.474
64b of text | -2547.106
64b random | -3731.700
100k of same byte | 1093.885
100k text | 849.430
(9 rows)
pglz-replace-pointers-with-indexes.patch (the patch I posted earlier):
testname | auto
-------------------+-----------
5k text | 1251.576
512b text | 946.348
2k random | -815.095
100k random | -808.356
512b random | -614.418
64b of text | -744.382
64b random | -1060.758
100k of same byte | 1192.397
100k text | 796.530
(9 rows)
pglz-variable-size-hash-table.patch:
testname | auto
-------------------+----------
5k text | 1276.905
512b text | 823.796
2k random | -844.484
100k random | -848.080
512b random | -615.039
64b of text | -393.448
64b random | -568.685
100k of same byte | 1186.759
100k text | 799.978
(9 rows)
These values are from a single run of the test, but I did repeat them
several times to make sure there isn't too much variability in them to
render the results meaningless. The negative values mean that
pglz_compress gave up on the compression in the test, i.e., they show how
long it took for pglz_compress to realize that it cannot compress the
input. Compare the abs() values.
With the variable-size hash table, I'm not sure how significant the earlier patch to use indexes instead of pointers is. But I don't think it makes things any worse, so it's included in this.
On 01.03.2013 17:42, Heikki Linnakangas wrote:
On 01.03.2013 17:37, Alvaro Herrera wrote:
> My take on this is that if this patch is necessary to get Amit's patch to a committable state, it's fair game.
I don't think it's necessary for that, but let's see..
With these tweaks, I was able to make pglz-based delta encoding perform roughly as well as Amit's patch. So I think that's the approach we should take, as it's a bit simpler and more versatile. I'll follow up on that in the other thread.
- Heikki
diff --git a/src/backend/utils/adt/pg_lzcompress.c b/src/backend/utils/adt/pg_lzcompress.c
index 66c64c1..b8a69ec 100644
--- a/src/backend/utils/adt/pg_lzcompress.c
+++ b/src/backend/utils/adt/pg_lzcompress.c
@@ -112,7 +112,7 @@
* of identical bytes like trailing spaces) and for bigger ones
* our 4K maximum look-back distance is too small.
*
- * The compressor creates a table for 8192 lists of positions.
+ * The compressor creates a table for lists of positions.
* For each input position (except the last 3), a hash key is
* built from the 4 next input bytes and the position remembered
* in the appropriate list. Thus, the table points to linked
@@ -120,7 +120,10 @@
* matching strings. This is done on the fly while the input
* is compressed into the output area. Table entries are only
* kept for the last 4096 input positions, since we cannot use
- * back-pointers larger than that anyway.
+ * back-pointers larger than that anyway. The size of the hash
+ * table depends on the size of the input - a larger table has
+ * a larger startup cost, as it needs to be initialized to zero,
+ * but reduces the number of hash collisions on long inputs.
*
* For each byte in the input, it's hash key (built from this
* byte and the next 3) is used to find the appropriate list
@@ -180,8 +183,7 @@
* Local definitions
* ----------
*/
-#define PGLZ_HISTORY_LISTS 8192 /* must be power of 2 */
-#define PGLZ_HISTORY_MASK (PGLZ_HISTORY_LISTS - 1)
+#define PGLZ_MAX_HISTORY_LISTS 8192 /* must be power of 2 */
#define PGLZ_HISTORY_SIZE 4096
#define PGLZ_MAX_MATCH 273
@@ -198,9 +200,9 @@
*/
typedef struct PGLZ_HistEntry
{
- struct PGLZ_HistEntry *next; /* links for my hash key's list */
- struct PGLZ_HistEntry *prev;
- int hindex; /* my current hash key */
+ int16 next; /* links for my hash key's list */
+ int16 prev;
+ uint32 hindex; /* my current hash key */
const char *pos; /* my input position */
} PGLZ_HistEntry;
@@ -241,9 +243,11 @@ const PGLZ_Strategy *const PGLZ_strategy_always = &strategy_always_data;
* Statically allocated work arrays for history
* ----------
*/
-static PGLZ_HistEntry *hist_start[PGLZ_HISTORY_LISTS];
-static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE];
+static int16 hist_start[PGLZ_MAX_HISTORY_LISTS];
+static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE + 1];
+/* Element 0 in hist_entries is unused, and means 'invalid'. */
+#define INVALID_ENTRY 0
/* ----------
* pglz_hist_idx -
@@ -257,10 +261,10 @@ static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE];
* hash keys more.
* ----------
*/
-#define pglz_hist_idx(_s,_e) ( \
+#define pglz_hist_idx(_s,_e, _mask) ( \
((((_e) - (_s)) < 4) ? (int) (_s)[0] : \
- (((_s)[0] << 9) ^ ((_s)[1] << 6) ^ \
- ((_s)[2] << 3) ^ (_s)[3])) & (PGLZ_HISTORY_MASK) \
+ (((_s)[0] << 6) ^ ((_s)[1] << 4) ^ \
+ ((_s)[2] << 2) ^ (_s)[3])) & (_mask) \
)
@@ -276,28 +280,28 @@ static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE];
* _hn and _recycle are modified in the macro.
* ----------
*/
-#define pglz_hist_add(_hs,_he,_hn,_recycle,_s,_e) \
+#define pglz_hist_add(_hs,_he,_hn,_recycle,_s,_e, _mask) \
do { \
- int __hindex = pglz_hist_idx((_s),(_e)); \
- PGLZ_HistEntry **__myhsp = &(_hs)[__hindex]; \
+ int __hindex = pglz_hist_idx((_s),(_e), (_mask)); \
+ int16 *__myhsp = &(_hs)[__hindex]; \
PGLZ_HistEntry *__myhe = &(_he)[_hn]; \
if (_recycle) { \
- if (__myhe->prev == NULL) \
+ if (__myhe->prev == INVALID_ENTRY) \
(_hs)[__myhe->hindex] = __myhe->next; \
else \
- __myhe->prev->next = __myhe->next; \
- if (__myhe->next != NULL) \
- __myhe->next->prev = __myhe->prev; \
+ (_he)[__myhe->prev].next = __myhe->next; \
+ if (__myhe->next != INVALID_ENTRY) \
+ (_he)[__myhe->next].prev = __myhe->prev; \
} \
__myhe->next = *__myhsp; \
- __myhe->prev = NULL; \
+ __myhe->prev = INVALID_ENTRY; \
__myhe->hindex = __hindex; \
__myhe->pos = (_s); \
- if (*__myhsp != NULL) \
- (*__myhsp)->prev = __myhe; \
- *__myhsp = __myhe; \
- if (++(_hn) >= PGLZ_HISTORY_SIZE) { \
- (_hn) = 0; \
+ if (*__myhsp != INVALID_ENTRY) \
+ (_he)[(*__myhsp)].prev = _hn; \
+ *__myhsp = _hn; \
+ if (++(_hn) >= PGLZ_HISTORY_SIZE + 1) { \
+ (_hn) = 1; \
(_recycle) = true; \
} \
} while (0)
@@ -372,19 +376,20 @@ do { \
* ----------
*/
static inline int
-pglz_find_match(PGLZ_HistEntry **hstart, const char *input, const char *end,
- int *lenp, int *offp, int good_match, int good_drop)
+pglz_find_match(int16 *hstart, const char *input, const char *end,
+ int *lenp, int *offp, int good_match, int good_drop, int mask)
{
- PGLZ_HistEntry *hent;
+ int16 hentno;
int32 len = 0;
int32 off = 0;
/*
* Traverse the linked history list until a good enough match is found.
*/
- hent = hstart[pglz_hist_idx(input, end)];
- while (hent)
+ hentno = hstart[pglz_hist_idx(input, end, mask)];
+ while (hentno != INVALID_ENTRY)
{
+ PGLZ_HistEntry *hent = &hist_entries[hentno];
const char *ip = input;
const char *hp = hent->pos;
int32 thisoff;
@@ -443,13 +448,13 @@ pglz_find_match(PGLZ_HistEntry **hstart, const char *input, const char *end,
/*
* Advance to the next history entry
*/
- hent = hent->next;
+ hentno = hent->next;
/*
* Be happy with lesser good matches the more entries we visited. But
* no point in doing calculation if we're at end of list.
*/
- if (hent)
+ if (hentno != INVALID_ENTRY)
{
if (len >= good_match)
break;
@@ -484,7 +489,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
{
unsigned char *bp = ((unsigned char *) dest) + sizeof(PGLZ_Header);
unsigned char *bstart = bp;
- int hist_next = 0;
+ int hist_next = 1;
bool hist_recycle = false;
const char *dp = source;
const char *dend = source + slen;
@@ -500,6 +505,8 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
int32 result_size;
int32 result_max;
int32 need_rate;
+ int hashsz;
+ int mask;
/*
* Our fallback strategy is the default.
@@ -556,10 +563,28 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
result_max = (slen * (100 - need_rate)) / 100;
/*
+ * Experiments suggest that these hash sizes work pretty well. A large
+ * hash table minimizes collision, but has a higher startup cost. For
+ * a small input, the startup cost dominates. The table size must be
+ * a power of two.
+ */
+ if (slen < 128)
+ hashsz = 512;
+ else if (slen < 256)
+ hashsz = 1024;
+ else if (slen < 512)
+ hashsz = 2048;
+ else if (slen < 1024)
+ hashsz = 4096;
+ else
+ hashsz = 8192;
+ mask = hashsz - 1;
+
+ /*
* Initialize the history lists to empty. We do not need to zero the
* hist_entries[] array; its entries are initialized as they are used.
*/
- memset(hist_start, 0, sizeof(hist_start));
+ memset(hist_start, 0, hashsz * sizeof(int16));
/*
* Compress the source directly into the output buffer.
@@ -589,7 +614,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
* Try to find a match in the history
*/
if (pglz_find_match(hist_start, dp, dend, &match_len,
- &match_off, good_match, good_drop))
+ &match_off, good_match, good_drop, mask))
{
/*
* Create the tag and add history entries for all matched
@@ -600,7 +625,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
{
pglz_hist_add(hist_start, hist_entries,
hist_next, hist_recycle,
- dp, dend);
+ dp, dend, mask);
dp++; /* Do not do this ++ in the line above! */
/* The macro would do it four times - Jan. */
}
@@ -614,7 +639,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp);
pglz_hist_add(hist_start, hist_entries,
hist_next, hist_recycle,
- dp, dend);
+ dp, dend, mask);
dp++; /* Do not do this ++ in the line above! */
/* The macro would do it four times - Jan. */
}
compresstest.tar.gz
Description: GNU Zip compressed data
-- Sent via pgsql-hackers mailing list ([email protected]) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
