I spent some more time on this, and came up with the attached patch. It includes the changes I posted earlier, to use indexes instead of pointers in the hash table. In addition, it makes the hash table size variable, depending on the length of the input. This further reduces the startup cost on small inputs. I changed the hash method slightly, because the old method would not use any bits from the 3rd byte with a small hash table size, but fortunately that didn't seem to negative impact with larger hash table sizes either.

I wrote a little C extension to test this. It contains a function, which runs pglz_compress() on a bytea input, N times. I ran that with different kinds of inputs, and got the following results:

unpatched:

     testname      |   auto
-------------------+-----------
 5k text           |  1425.750
 512b text         |  1287.413
 2k random         | -1053.160
 100k random       | -1056.379
 512b random       | -1018.474
 64b of text       | -2547.106
 64b random        | -3731.700
 100k of same byte |  1093.885
 100k text         |   849.430
(9 rows)

pglz-replace-pointers-with-indexes.patch (the patch I posted earlier):

    testname      |   auto
-------------------+-----------
 5k text           |  1251.576
 512b text         |   946.348
 2k random         |  -815.095
 100k random       |  -808.356
 512b random       |  -614.418
 64b of text       |  -744.382
 64b random        | -1060.758
 100k of same byte |  1192.397
 100k text         |   796.530
(9 rows)

pglz-variable-size-hash-table.patch:

     testname      |  auto
-------------------+----------
 5k text           | 1276.905
 512b text         |  823.796
 2k random         | -844.484
 100k random       | -848.080
 512b random       | -615.039
 64b of text       | -393.448
 64b random        | -568.685
 100k of same byte | 1186.759
 100k text         |  799.978
(9 rows)

These values are from a single run of the test, but I did repeat them several times to make sure there isn't too much variability in them to render the results meaningless. The negative values mean that pglz_compress gave up on the compression in the test, ie. it shows how long it took for pglz_compress to realize that it can't compress the input. Compare the abs() values.

With the variable-size hash table, I'm not sure how significant the earlier patch to use indexes instead of pointers is. But I don't think it makes things any worse, so it's included in this.

On 01.03.2013 17:42, Heikki Linnakangas wrote:
On 01.03.2013 17:37, Alvaro Herrera wrote:
My take on this is that if this patch is necessary to get Amit's patch
to a commitable state, it's fair game.

I don't think it's necessary for that, but let's see..

With these tweaks, I was able to make pglz-based delta encoding perform roughly as well as Amit's patch. So I think that's the approach we should take, as it's a bit simpler and more versatile. I'll follow up on that in the other thread.

- Heikki
diff --git a/src/backend/utils/adt/pg_lzcompress.c b/src/backend/utils/adt/pg_lzcompress.c
index 66c64c1..b8a69ec 100644
--- a/src/backend/utils/adt/pg_lzcompress.c
+++ b/src/backend/utils/adt/pg_lzcompress.c
@@ -112,7 +112,7 @@
  *			of identical bytes like trailing spaces) and for bigger ones
  *			our 4K maximum look-back distance is too small.
  *
- *			The compressor creates a table for 8192 lists of positions.
+ *			The compressor creates a table for lists of positions.
  *			For each input position (except the last 3), a hash key is
  *			built from the 4 next input bytes and the position remembered
  *			in the appropriate list. Thus, the table points to linked
@@ -120,7 +120,10 @@
  *			matching strings. This is done on the fly while the input
  *			is compressed into the output area.  Table entries are only
  *			kept for the last 4096 input positions, since we cannot use
- *			back-pointers larger than that anyway.
+ *			back-pointers larger than that anyway.  The size of the hash
+ *			table depends on the size of the input - a larger table has
+ *			a larger startup cost, as it needs to be initialized to zero,
+ *			but reduces the number of hash collisions on long inputs.
  *
  *			For each byte in the input, it's hash key (built from this
  *			byte and the next 3) is used to find the appropriate list
@@ -180,8 +183,7 @@
  * Local definitions
  * ----------
  */
-#define PGLZ_HISTORY_LISTS		8192	/* must be power of 2 */
-#define PGLZ_HISTORY_MASK		(PGLZ_HISTORY_LISTS - 1)
+#define PGLZ_MAX_HISTORY_LISTS	8192	/* must be power of 2 */
 #define PGLZ_HISTORY_SIZE		4096
 #define PGLZ_MAX_MATCH			273
 
@@ -198,9 +200,9 @@
  */
 typedef struct PGLZ_HistEntry
 {
-	struct PGLZ_HistEntry *next;	/* links for my hash key's list */
-	struct PGLZ_HistEntry *prev;
-	int			hindex;			/* my current hash key */
+	int16		next;			/* links for my hash key's list */
+	int16		prev;
+	uint32		hindex;			/* my current hash key */
 	const char *pos;			/* my input position */
 } PGLZ_HistEntry;
 
@@ -241,9 +243,11 @@ const PGLZ_Strategy *const PGLZ_strategy_always = &strategy_always_data;
  * Statically allocated work arrays for history
  * ----------
  */
-static PGLZ_HistEntry *hist_start[PGLZ_HISTORY_LISTS];
-static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE];
+static int16 hist_start[PGLZ_MAX_HISTORY_LISTS];
+static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE + 1];
 
+/* Element 0 in hist_entries is unused, and means 'invalid'. */
+#define INVALID_ENTRY			0
 
 /* ----------
  * pglz_hist_idx -
@@ -257,10 +261,10 @@ static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE];
  * hash keys more.
  * ----------
  */
-#define pglz_hist_idx(_s,_e) (												\
+#define pglz_hist_idx(_s,_e, _mask) (										\
 			((((_e) - (_s)) < 4) ? (int) (_s)[0] :							\
-			 (((_s)[0] << 9) ^ ((_s)[1] << 6) ^								\
-			  ((_s)[2] << 3) ^ (_s)[3])) & (PGLZ_HISTORY_MASK)				\
+			 (((_s)[0] << 6) ^ ((_s)[1] << 4) ^								\
+			  ((_s)[2] << 2) ^ (_s)[3])) & (_mask)				\
 		)
 
 
@@ -276,28 +280,28 @@ static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE];
  * _hn and _recycle are modified in the macro.
  * ----------
  */
-#define pglz_hist_add(_hs,_he,_hn,_recycle,_s,_e) \
+#define pglz_hist_add(_hs,_he,_hn,_recycle,_s,_e, _mask)	\
 do {									\
-			int __hindex = pglz_hist_idx((_s),(_e));						\
-			PGLZ_HistEntry **__myhsp = &(_hs)[__hindex];					\
+			int __hindex = pglz_hist_idx((_s),(_e), (_mask));				\
+			int16 *__myhsp = &(_hs)[__hindex];								\
 			PGLZ_HistEntry *__myhe = &(_he)[_hn];							\
 			if (_recycle) {													\
-				if (__myhe->prev == NULL)									\
+				if (__myhe->prev == INVALID_ENTRY)							\
 					(_hs)[__myhe->hindex] = __myhe->next;					\
 				else														\
-					__myhe->prev->next = __myhe->next;						\
-				if (__myhe->next != NULL)									\
-					__myhe->next->prev = __myhe->prev;						\
+					(_he)[__myhe->prev].next = __myhe->next;				\
+				if (__myhe->next != INVALID_ENTRY)							\
+					(_he)[__myhe->next].prev = __myhe->prev;				\
 			}																\
 			__myhe->next = *__myhsp;										\
-			__myhe->prev = NULL;											\
+			__myhe->prev = INVALID_ENTRY;									\
 			__myhe->hindex = __hindex;										\
 			__myhe->pos  = (_s);											\
-			if (*__myhsp != NULL)											\
-				(*__myhsp)->prev = __myhe;									\
-			*__myhsp = __myhe;												\
-			if (++(_hn) >= PGLZ_HISTORY_SIZE) {								\
-				(_hn) = 0;													\
+			if (*__myhsp != INVALID_ENTRY)									\
+				(_he)[(*__myhsp)].prev = _hn;								\
+			*__myhsp = _hn;													\
+			if (++(_hn) >= PGLZ_HISTORY_SIZE + 1) {							\
+				(_hn) = 1;													\
 				(_recycle) = true;											\
 			}																\
 } while (0)
@@ -372,19 +376,20 @@ do { \
  * ----------
  */
 static inline int
-pglz_find_match(PGLZ_HistEntry **hstart, const char *input, const char *end,
-				int *lenp, int *offp, int good_match, int good_drop)
+pglz_find_match(int16 *hstart, const char *input, const char *end,
+				int *lenp, int *offp, int good_match, int good_drop, int mask)
 {
-	PGLZ_HistEntry *hent;
+	int16		hentno;
 	int32		len = 0;
 	int32		off = 0;
 
 	/*
 	 * Traverse the linked history list until a good enough match is found.
 	 */
-	hent = hstart[pglz_hist_idx(input, end)];
-	while (hent)
+	hentno = hstart[pglz_hist_idx(input, end, mask)];
+	while (hentno != INVALID_ENTRY)
 	{
+		PGLZ_HistEntry *hent = &hist_entries[hentno];
 		const char *ip = input;
 		const char *hp = hent->pos;
 		int32		thisoff;
@@ -443,13 +448,13 @@ pglz_find_match(PGLZ_HistEntry **hstart, const char *input, const char *end,
 		/*
 		 * Advance to the next history entry
 		 */
-		hent = hent->next;
+		hentno = hent->next;
 
 		/*
 		 * Be happy with lesser good matches the more entries we visited. But
 		 * no point in doing calculation if we're at end of list.
 		 */
-		if (hent)
+		if (hentno != INVALID_ENTRY)
 		{
 			if (len >= good_match)
 				break;
@@ -484,7 +489,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
 {
 	unsigned char *bp = ((unsigned char *) dest) + sizeof(PGLZ_Header);
 	unsigned char *bstart = bp;
-	int			hist_next = 0;
+	int			hist_next = 1;
 	bool		hist_recycle = false;
 	const char *dp = source;
 	const char *dend = source + slen;
@@ -500,6 +505,8 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
 	int32		result_size;
 	int32		result_max;
 	int32		need_rate;
+	int			hashsz;
+	int			mask;
 
 	/*
 	 * Our fallback strategy is the default.
@@ -556,10 +563,28 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
 		result_max = (slen * (100 - need_rate)) / 100;
 
 	/*
+	 * Experiments suggest that these hash sizes work pretty well. A large
+	 * hash table minimizes collision, but has a higher startup cost. For
+	 * a small input, the startup cost dominates. The table size must be
+	 * a power of two.
+	 */
+	if (slen < 128)
+		hashsz = 512;
+	else if (slen < 256)
+		hashsz = 1024;
+	else if (slen < 512)
+		hashsz = 2048;
+	else if (slen < 1024)
+		hashsz = 4096;
+	else
+		hashsz = 8192;
+	mask = hashsz - 1;
+
+	/*
 	 * Initialize the history lists to empty.  We do not need to zero the
 	 * hist_entries[] array; its entries are initialized as they are used.
 	 */
-	memset(hist_start, 0, sizeof(hist_start));
+	memset(hist_start, 0, hashsz * sizeof(int16));
 
 	/*
 	 * Compress the source directly into the output buffer.
@@ -589,7 +614,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
 		 * Try to find a match in the history
 		 */
 		if (pglz_find_match(hist_start, dp, dend, &match_len,
-							&match_off, good_match, good_drop))
+							&match_off, good_match, good_drop, mask))
 		{
 			/*
 			 * Create the tag and add history entries for all matched
@@ -600,7 +625,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
 			{
 				pglz_hist_add(hist_start, hist_entries,
 							  hist_next, hist_recycle,
-							  dp, dend);
+							  dp, dend, mask);
 				dp++;			/* Do not do this ++ in the line above! */
 				/* The macro would do it four times - Jan.	*/
 			}
@@ -614,7 +639,7 @@ pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
 			pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp);
 			pglz_hist_add(hist_start, hist_entries,
 						  hist_next, hist_recycle,
-						  dp, dend);
+						  dp, dend, mask);
 			dp++;				/* Do not do this ++ in the line above! */
 			/* The macro would do it four times - Jan.	*/
 		}

Attachment: compresstest.tar.gz
Description: GNU Zip compressed data

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to