Re: [HACKERS] Performance Improvement by reducing WAL for Update Operation

Heikki Linnakangas Wed, 05 Feb 2014 04:01:38 -0800

On 01/30/2014 08:53 AM, Amit Kapila wrote:

On Wed, Jan 29, 2014 at 8:13 PM, Heikki Linnakangas
<[email protected]> wrote:

On 01/29/2014 02:21 PM, Amit Kapila wrote:

The main reason to process in chunks as much as possible is to save
cpu cycles. For example if we build hash table byte-by-byte, then even
for best case where most of tuple has a match, it will have reasonable
overhead due to formation of hash table.


Hmm. One very simple optimization we could do is to just compare the two
strings byte by byte, before doing anything else, to find any common prefix
they might have. Then output a tag for the common prefix, and run the normal
algorithm on the rest of the strings. In many real-world tables, the 1-2
first columns are a key that never changes, so that might work pretty well
in practice. Maybe it would also be worthwhile to do the same for any common
suffix the tuples might have.


Is it possible to do for both prefix and suffix together, basically
the question I
have in mind is what will be deciding factor for switching from hash table
mechanism to string comparison mode for suffix. Do we switch when we find
long enough match?

I think you got it backwards. You don't switch from hash table mechanism to string comparison. You do the prefix/suffix comparison *first*, and run the hash table algorithm only on the "middle" part, between the common prefix and suffix.

Can we do this optimization after the basic version is acceptable?

I would actually suggest doing that first. Perhaps even ditch the whole history table approach and do *only* the scan for prefix and suffix. That's very cheap, and already covers a large fraction of UPDATEs that real applications do. In particular, it's optimal for the case that you update only a single column, something like "UPDATE foo SET bar = bar + 1".

I'm pretty sure the overhead of that would be negligible, so we could always enable it. There are certainly a lot of scenarios where prefix/suffix detection alone wouldn't help, but so what.


Attached is a quick patch for that, if you want to test it.

- Heikki

diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml
index e0b8a4e..c4ac2bd 100644
--- a/doc/src/sgml/ref/create_table.sgml
+++ b/doc/src/sgml/ref/create_table.sgml
@@ -1014,6 +1014,22 @@ CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXI
     </listitem>
    </varlistentry>
 
+   <varlistentry>
+    <term><literal>wal_compress_update</> (<type>boolean</>)</term>
+    <listitem>
+     <para>
+      Enables or disables the WAL tuple compression for <command>UPDATE</>
+      on this table.  The default value of this option is true.
+      If true, all update operations on this table that place the new
+      tuple on the same page as its original tuple will compress the WAL
+      for the new tuple and subsequently reduce the WAL volume.  It is
+      recommended to keep this option enabled for tables where
+      <command>UPDATE</> changes less than 50 percent of the tuple
+      data.
+     </para>
+     </listitem>
+    </varlistentry>
+
    </variablelist>
 
   </refsect2>
diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c
index aea9d40..3bf5728 100644
--- a/src/backend/access/common/heaptuple.c
+++ b/src/backend/access/common/heaptuple.c
@@ -60,6 +60,7 @@
 #include "access/sysattr.h"
 #include "access/tuptoaster.h"
 #include "executor/tuptable.h"
+#include "utils/pg_rbcompress.h"
 
 
 /* Does att's datatype allow packing into the 1-byte-header varlena format? */
@@ -617,6 +618,44 @@ heap_copytuple_with_tuple(HeapTuple src, HeapTuple dest)
 	memcpy((char *) dest->t_data, (char *) src->t_data, src->t_len);
 }
 
+/* ----------------
+ * heap_delta_encode
+ *
+ *		Delta-encode newtup against oldtup (both from t_bits onwards) with
+ *  pgrb_delta_encode() and its default strategy.  The encoded wal tuple
+ *  (EWT) goes to encdata, its length to *enclen.  tupleDesc is unused.
+ * ----------------
+ */
+bool
+heap_delta_encode(TupleDesc tupleDesc, HeapTuple oldtup, HeapTuple newtup,
+				  char *encdata, uint32 *enclen)
+{
+	return pgrb_delta_encode(
+		(char *) newtup->t_data + offsetof(HeapTupleHeaderData, t_bits),
+		newtup->t_len - offsetof(HeapTupleHeaderData, t_bits),
+		(char *) oldtup->t_data + offsetof(HeapTupleHeaderData, t_bits),
+		oldtup->t_len - offsetof(HeapTupleHeaderData, t_bits),
+		encdata, enclen, NULL
+		);
+}
+
+/* ----------------
+ * heap_delta_decode
+ *
+ *		Rebuild newtup from an EWT and oldtup; t_len excludes the fixed header.
+ * ----------------
+ */
+void
+heap_delta_decode(char *encdata, uint32 enclen, HeapTuple oldtup, HeapTuple newtup)
+{
+	pgrb_delta_decode(encdata, enclen,
+			 (char *) newtup->t_data + offsetof(HeapTupleHeaderData, t_bits),
+			 MaxHeapTupleSize - offsetof(HeapTupleHeaderData, t_bits),
+			 &newtup->t_len,
+			 (char *) oldtup->t_data + offsetof(HeapTupleHeaderData, t_bits),
+			 oldtup->t_len - offsetof(HeapTupleHeaderData, t_bits));
+}
+
 /*
  * heap_form_tuple
  *		construct a tuple from the given values[] and isnull[] arrays,
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index fa08c45..2123a61 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -85,6 +85,14 @@ static relopt_bool boolRelOpts[] =
 		},
 		false
 	},
+	{
+		{
+			"wal_compress_update",
+			"Compress the wal tuple for update operation on this relation",
+			RELOPT_KIND_HEAP
+		},
+		true
+	},
 	/* list terminator */
 	{{NULL}}
 };
@@ -1175,7 +1183,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
 		{"check_option", RELOPT_TYPE_STRING,
 		offsetof(StdRdOptions, check_option_offset)},
 		{"user_catalog_table", RELOPT_TYPE_BOOL,
-		 offsetof(StdRdOptions, user_catalog_table)}
+		 offsetof(StdRdOptions, user_catalog_table)},
+		{"wal_compress_update", RELOPT_TYPE_BOOL,
+		 offsetof(StdRdOptions, wal_compress_update)}
 	};
 
 	options = parseRelOptions(reloptions, validate, kind, &numoptions);
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index a771ccb..2724188 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -70,6 +70,7 @@
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
 #include "utils/tqual.h"
+#include "utils/pg_rbcompress.h"
 
 
 /* GUC variable */
@@ -6597,6 +6598,12 @@ log_heap_update(Relation reln, Buffer oldbuf,
 	XLogRecPtr	recptr;
 	XLogRecData rdata[7];
 	Page		page = BufferGetPage(newbuf);
+	char	   *newtupdata;
+	int			newtuplen;
+	bool		compressed = false;
+
+	/* Structure which holds EWT */
+	char		buf[MaxHeapTupleSize];
 	bool		need_tuple_data = RelationIsLogicallyLogged(reln);
 
 	/* Caller should not call me on a non-WAL-logged relation */
@@ -6607,6 +6614,37 @@ log_heap_update(Relation reln, Buffer oldbuf,
 	else
 		info = XLOG_HEAP_UPDATE;
 
+	newtupdata = ((char *) newtup->t_data) + offsetof(HeapTupleHeaderData, t_bits);
+	newtuplen = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits);
+
+	/*
+	 * EWT can be generated for all new tuple versions created by Update
+	 * operation. Currently we do it when both the old and new tuple versions
+	 * are on same page, because during recovery if the page containing old
+	 * tuple is corrupt, it should not cascade that corruption to other pages.
+	 * Under the general assumption that for long runs most updates tend to
+	 * create new tuple version on same page, there should not be significant
+	 * impact on WAL reduction or performance.
+	 *
+	 * We should not generate EWT when we need to backup the whole block in
+	 * WAL as in that case there is no saving by reduced WAL size.
+	 */
+
+	if (RelationIsEnabledForWalCompression(reln) &&
+		(oldbuf == newbuf) &&
+		!XLogCheckBufferNeedsBackup(newbuf))
+	{
+		uint32		enclen;
+
+		/* Delta-encode the new tuple using the old tuple */
+		if (heap_delta_encode(reln->rd_att, oldtup, newtup, buf, &enclen))
+		{
+			compressed = true;
+			newtupdata = buf;
+			newtuplen = enclen;
+		}
+	}
+
 	xlrec.target.node = reln->rd_node;
 	xlrec.target.tid = oldtup->t_self;
 	xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
@@ -6619,6 +6657,8 @@ log_heap_update(Relation reln, Buffer oldbuf,
 	xlrec.newtid = newtup->t_self;
 	if (new_all_visible_cleared)
 		xlrec.flags |= XLOG_HEAP_NEW_ALL_VISIBLE_CLEARED;
+	if (compressed)
+		xlrec.flags |= XLOG_HEAP_DELTA_ENCODED;
 
 	rdata[0].data = (char *) &xlrec;
 	rdata[0].len = SizeOfHeapUpdate;
@@ -6634,7 +6674,7 @@ log_heap_update(Relation reln, Buffer oldbuf,
 	xlhdr.header.t_infomask2 = newtup->t_data->t_infomask2;
 	xlhdr.header.t_infomask = newtup->t_data->t_infomask;
 	xlhdr.header.t_hoff = newtup->t_data->t_hoff;
-	xlhdr.t_len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits);
+	xlhdr.t_len = newtuplen;
 
 	/*
 	 * As with insert records, we need not store the rdata[2] segment
@@ -6647,10 +6687,13 @@ log_heap_update(Relation reln, Buffer oldbuf,
 	rdata[2].buffer_std = true;
 	rdata[2].next = &(rdata[3]);
 
-	/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
-	rdata[3].data = (char *) newtup->t_data
-		+ offsetof(HeapTupleHeaderData, t_bits);
-	rdata[3].len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits);
+	/*
+	 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data OR
+	 * PG94FORMAT [If encoded]: Control byte + history reference (2 - 3)bytes
+	 *							+ literal byte + ...
+	 */
+	rdata[3].data = newtupdata;
+	rdata[3].len = newtuplen;
 	rdata[3].buffer = need_tuple_data ? InvalidBuffer : newbuf;
 	rdata[3].buffer_std = true;
 	rdata[3].next = NULL;
@@ -7739,7 +7782,10 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
 	Page		page;
 	OffsetNumber offnum;
 	ItemId		lp = NULL;
+	HeapTupleData newtup;
+	HeapTupleData oldtup;
 	HeapTupleHeader htup;
+	HeapTupleHeader oldtupdata = NULL;
 	struct
 	{
 		HeapTupleHeaderData hdr;
@@ -7814,7 +7860,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
 	if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
 		elog(PANIC, "heap_update_redo: invalid lp");
 
-	htup = (HeapTupleHeader) PageGetItem(page, lp);
+	oldtupdata = htup = (HeapTupleHeader) PageGetItem(page, lp);
 
 	htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
 	htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
@@ -7923,10 +7969,31 @@ newsame:;
 	Assert(newlen <= MaxHeapTupleSize);
 	htup = &tbuf.hdr;
 	MemSet((char *) htup, 0, sizeof(HeapTupleHeaderData));
-	/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
-	memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits),
-		   (char *) xlrec + hsize,
-		   newlen);
+
+	/*
+	 * If the record is EWT then decode it.
+	 */
+	if (xlrec->flags & XLOG_HEAP_DELTA_ENCODED)
+	{
+		/*
+		 * PG94FORMAT: Control byte + history reference (2 - 3)bytes
+		 * + literal byte + ...
+		 */
+		oldtup.t_data = oldtupdata;
+		oldtup.t_len = ItemIdGetLength(lp);
+		newtup.t_data = htup;
+
+		heap_delta_decode((char *) xlrec + hsize, newlen, &oldtup, &newtup);
+		newlen = newtup.t_len;
+	}
+	else
+	{
+		/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
+		memcpy((char *) htup + offsetof(HeapTupleHeaderData, t_bits),
+			   (char *) xlrec + hsize,
+			   newlen);
+	}
+
 	newlen += offsetof(HeapTupleHeaderData, t_bits);
 	htup->t_infomask2 = xlhdr.header.t_infomask2;
 	htup->t_infomask = xlhdr.header.t_infomask;
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index b333d82..92c4f00 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2327,6 +2327,28 @@ XLogRecPtrToBytePos(XLogRecPtr ptr)
 }
 
 /*
+ * Determine whether the buffer would need a full-page backup: page writes
+ * are enabled and the page's LSN is <= RedoRecPtr.  We don't hold the insert
+ * lock, so the answer may be stale; that is acceptable because it is only
+ * used to decide whether EWT is required for update.
+ */
+bool
+XLogCheckBufferNeedsBackup(Buffer buffer)
+{
+	bool		doPageWrites;
+	Page		page;
+
+	page = BufferGetPage(buffer);
+
+	doPageWrites = XLogCtl->Insert.fullPageWrites || XLogCtl->Insert.forcePageWrites;
+
+	if (doPageWrites && PageGetLSN(page) <= RedoRecPtr)
+		return true;			/* buffer requires backup */
+
+	return false;				/* buffer does not need to be backed up */
+}
+
+/*
  * Determine whether the buffer referenced by an XLogRecData item has to
  * be backed up, and if so fill a BkpBlock struct for it.  In any case
  * save the buffer's LSN at *lsn.
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile
index 1ae9fa0..04dc17c 100644
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -26,7 +26,7 @@ OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \
 	rowtypes.o regexp.o regproc.o ruleutils.o selfuncs.o \
 	tid.o timestamp.o varbit.o varchar.o varlena.o version.o xid.o \
 	network.o mac.o inet_cidr_ntop.o inet_net_pton.o \
-	ri_triggers.o pg_lzcompress.o pg_locale.o formatting.o \
+	ri_triggers.o pg_lzcompress.o pg_rbcompress.o pg_locale.o formatting.o \
 	ascii.o quote.o pgstatfuncs.o encode.o dbsize.o genfile.o trigfuncs.o \
 	tsginidx.o tsgistidx.o tsquery.o tsquery_cleanup.o tsquery_gist.o \
 	tsquery_op.o tsquery_rewrite.o tsquery_util.o tsrank.o \
diff --git a/src/backend/utils/adt/pg_rbcompress.c b/src/backend/utils/adt/pg_rbcompress.c
new file mode 100644
index 0000000..bcebed2
--- /dev/null
+++ b/src/backend/utils/adt/pg_rbcompress.c
@@ -0,0 +1,353 @@
+/* ----------
+ * pg_rbcompress.c -
+ *
+ *		This is a delta encoding scheme specific to PostgreSQL and designed
+ *		to compress similar tuples. It can be used as it is or extended for
+ *		other purposes in PostgreSQL if required.
+ *
+ *		Currently, this just checks for a common prefix and/or suffix, but
+ *		the output format is similar to the LZ format used in pg_lzcompress.c.
+ *
+ * Copyright (c) 1999-2014, PostgreSQL Global Development Group
+ *
+ * src/backend/utils/adt/pg_rbcompress.c
+ * ----------
+ */
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "utils/pg_rbcompress.h"
+
+
+/* ----------
+ * Local definitions
+ * ----------
+ */
+#define PGRB_HISTORY_SIZE		4096
+#define PGRB_MIN_MATCH			4
+
+
+
+/* ----------
+ * The provided standard strategies
+ * ----------
+ */
+static const PGRB_Strategy strategy_default_data = {
+	32,							/* Data chunks less than 32 bytes are not
+								 * compressed */
+	INT_MAX,					/* No upper limit on what we'll try to
+								 * compress */
+	25,							/* Require 25% compression rate, or not worth
+								 * it */
+};
+const PGRB_Strategy *const PGRB_strategy_default = &strategy_default_data;
+
+
+/* ----------
+ * pgrb_out_ctrl -
+ *
+ *		Outputs the last and allocates a new control byte if needed.
+ * ----------
+ */
+#define pgrb_out_ctrl(__ctrlp,__ctrlb,__ctrl,__buf) \
+do { \
+	if ((__ctrl & 0xff) == 0)												\
+	{																		\
+		*(__ctrlp) = __ctrlb;												\
+		__ctrlp = (__buf)++;												\
+		__ctrlb = 0;														\
+		__ctrl = 1;															\
+	}																		\
+} while (0)
+
+
+/* ----------
+ * pgrb_out_literal -
+ *
+ *		Outputs a literal byte to the destination buffer including the
+ *		appropriate control bit.
+ * ----------
+ */
+#define pgrb_out_literal(_ctrlp,_ctrlb,_ctrl,_buf,_byte) \
+do { \
+	pgrb_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf);								\
+	*(_buf)++ = (unsigned char)(_byte);										\
+	_ctrl <<= 1;															\
+} while (0)
+
+
+/* ----------
+ * pgrb_out_tag -
+ *
+ *		Outputs a history reference tag of 2-3 bytes (3 if _len > 17)
+ *		to the destination buffer including the appropriate control
+ *		bit.  _len must be 3..273 and _off must fit in 12 bits.
+ * ----------
+ */
+#define pgrb_out_tag(_ctrlp,_ctrlb,_ctrl,_buf,_len,_off) \
+do { \
+	pgrb_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf);								\
+	_ctrlb |= _ctrl;														\
+	_ctrl <<= 1;															\
+	if (_len > 17)															\
+	{																		\
+		(_buf)[0] = (unsigned char)((((_off) & 0xf00) >> 4) | 0x0f);		\
+		(_buf)[1] = (unsigned char)(((_off) & 0xff));						\
+		(_buf)[2] = (unsigned char)((_len) - 18);							\
+		(_buf) += 3;														\
+	} else {																\
+		(_buf)[0] = (unsigned char)((((_off) & 0xf00) >> 4) | ((_len) - 3)); \
+		(_buf)[1] = (unsigned char)((_off) & 0xff);							\
+		(_buf) += 2;														\
+	}																		\
+} while (0)
+
+/* ----------
+ * pgrb_delta_encode - find common prefix/suffix between inputs and encode.
+ *
+ *	source is the input data to be compressed, slen is its length
+ *	history is the data used as reference for compression, hlen its length
+ *	dest receives the encoded result; its length is returned in *finallen
+ *	strategy gives the size/rate limits; NULL means PGRB_strategy_default
+ *
+ *	The return value is true if compression succeeded, false if not; in
+ *	the latter case the contents of dest are undefined.
+ *
+ *	A tag covers PGRB_MIN_TAG_LEN..PGRB_MAX_TAG_LEN bytes at an offset
+ *	counted backwards from the end of history, so a longer prefix or
+ *	suffix is written as several consecutive tags.
+ * ----------
+ */
+#define PGRB_MIN_TAG_LEN		3
+#define PGRB_MAX_TAG_LEN		273
+
+/* Length of the next tag for a match that has 'remaining' bytes left. */
+static int
+pgrb_next_taglen(int remaining)
+{
+	if (remaining <= PGRB_MAX_TAG_LEN)
+		return remaining;
+	/* never leave a tail that is too short to be encoded as a tag */
+	if (remaining - PGRB_MAX_TAG_LEN < PGRB_MIN_TAG_LEN)
+		return remaining - PGRB_MIN_TAG_LEN;
+	return PGRB_MAX_TAG_LEN;
+}
+
+bool
+pgrb_delta_encode(const char *source, int32 slen,
+				  const char *history, int32 hlen,
+				  char *dest, uint32 *finallen,
+				  const PGRB_Strategy *strategy)
+{
+	unsigned char *bp = ((unsigned char *) dest);
+	unsigned char *bstart = bp;
+	const char *dp;
+	const char *dend;
+	unsigned char ctrl_dummy = 0;
+	unsigned char *ctrlp = &ctrl_dummy;
+	unsigned char ctrlb = 0;
+	unsigned char ctrl = 0;
+	int32		result_size;
+	int32		result_max;
+	int32		need_rate;
+	int			prefixlen;
+	int			suffixlen;
+	int			remaining;
+
+	/* The tag offset is 12 bits wide, so history must fit that range. */
+	if (hlen >= PGRB_HISTORY_SIZE || hlen < PGRB_MIN_MATCH)
+		return false;
+
+	/* Our fallback strategy is the default. */
+	if (strategy == NULL)
+		strategy = PGRB_strategy_default;
+
+	/*
+	 * If the strategy forbids compression (at all or if source chunk size out
+	 * of range), fail.
+	 */
+	if (slen < strategy->min_input_size ||
+		slen > strategy->max_input_size)
+		return false;
+
+	need_rate = strategy->min_comp_rate;
+	if (need_rate < 0)
+		need_rate = 0;
+	else if (need_rate > 99)
+		need_rate = 99;
+
+	/*
+	 * Compute the maximum result size allowed by the strategy, namely the
+	 * input size minus the minimum wanted compression rate.
+	 */
+	if (slen > (INT_MAX / 100))
+		result_max = (slen / 100) * (100 - need_rate);	/* avoid overflow */
+	else
+		result_max = (slen * (100 - need_rate)) / 100;
+
+	/* Find the common prefix; too short a match is not worth a tag. */
+	for (prefixlen = 0; prefixlen < hlen && prefixlen < slen; prefixlen++)
+	{
+		if (history[prefixlen] != source[prefixlen])
+			break;
+	}
+	if (prefixlen < PGRB_MIN_MATCH)
+		prefixlen = 0;
+
+	/* Find the common suffix, without running into the prefix. */
+	suffixlen = 0;
+	while (suffixlen < hlen - prefixlen && suffixlen < slen - prefixlen &&
+		   history[hlen - 1 - suffixlen] == source[slen - 1 - suffixlen])
+		suffixlen++;
+	if (suffixlen < PGRB_MIN_MATCH)
+		suffixlen = 0;
+
+	/*
+	 * FIXME: need to be more careful here, to make sure we don't overflow
+	 * the buffer!
+	 */
+	if (slen - prefixlen - suffixlen > (slen * need_rate) / 100)
+		return false;
+
+	/* Ok, this is worth delta encoding.  Output the prefix, if any. */
+	for (remaining = prefixlen; remaining > 0;)
+	{
+		int			taglen = pgrb_next_taglen(remaining);
+		int			tagoff = hlen - (prefixlen - remaining);
+
+		pgrb_out_tag(ctrlp, ctrlb, ctrl, bp, taglen, tagoff);
+		remaining -= taglen;
+	}
+
+	/* output bytes between prefix and suffix as literals */
+	dend = &source[slen - suffixlen];
+	for (dp = &source[prefixlen]; dp < dend; dp++)
+		pgrb_out_literal(ctrlp, ctrlb, ctrl, bp, *dp);
+
+	/* Output the suffix, if any; 'remaining' is its offset from the end. */
+	for (remaining = suffixlen; remaining > 0;)
+	{
+		int			taglen = pgrb_next_taglen(remaining);
+
+		pgrb_out_tag(ctrlp, ctrlb, ctrl, bp, taglen, remaining);
+		remaining -= taglen;
+	}
+
+	/* Write out the last control byte and check the strategy's size limit. */
+	*ctrlp = ctrlb;
+	result_size = bp - bstart;
+	if (result_size > result_max)
+		return false;
+
+#ifdef DELTA_DEBUG
+	elog(LOG, "old %d new %d compressed %d", hlen, slen, result_size);
+#endif
+
+	*finallen = result_size;
+	return true;
+}
+
+/* ----------
+ * pgrb_delta_decode
+ *
+ *		Decompresses source into dest.
+ *
+ *	source/srclen is the encoded data produced by pgrb_delta_encode
+ *	dest/destlen is the output buffer and its capacity
+ *	*finallen receives the number of bytes written to dest
+ *	history/histlen is the reference data the tags point into
+ *	Every read from source and history and every write to dest is bounds
+ *	checked; any violation is reported as corrupt data with elog(PANIC).
+ * ----------
+ */
+void
+pgrb_delta_decode(const char *source, uint32 srclen,
+				  char *dest, uint32 destlen, uint32 *finallen,
+				  const char *history, uint32 histlen)
+{
+	const unsigned char *sp = (const unsigned char *) source;
+	const unsigned char *srcend = sp + srclen;
+	unsigned char *dp = (unsigned char *) dest;
+	unsigned char *destend = dp + destlen;
+	const char *hend = history + histlen;
+	bool		corrupt = false;
+
+	while (sp < srcend && dp < destend && !corrupt)
+	{
+		/*
+		 * Read one control byte and process the next 8 items (or as many as
+		 * remain in the compressed input).
+		 */
+		unsigned char ctrl = *sp++;
+		int			ctrlc;
+
+		for (ctrlc = 0; ctrlc < 8 && sp < srcend; ctrlc++)
+		{
+			if (ctrl & 1)
+			{
+				/*
+				 * A set control bit means TAG: low nibble of the first byte
+				 * is the match length minus 3, its high nibble and the next
+				 * byte are the 12-bit offset.  A length coded as 18 is
+				 * followed by an extension byte (0-255) added to it.
+				 */
+				uint32		len;
+				uint32		off;
+
+				if (srcend - sp < 2)
+				{
+					corrupt = true;		/* truncated tag */
+					break;
+				}
+				len = (sp[0] & 0x0f) + 3;
+				off = ((sp[0] & 0xf0) << 4) | sp[1];
+				sp += 2;
+				if (len == 18)
+				{
+					if (sp >= srcend)
+					{
+						corrupt = true; /* missing extension byte */
+						break;
+					}
+					len += *sp++;
+				}
+
+				/*
+				 * The offset counts backwards from the end of history, so
+				 * the match must lie entirely within history, and it must
+				 * fit into what is left of the output buffer.
+				 */
+				if (off > histlen || len > off ||
+					len > (uint32) (destend - dp))
+				{
+					corrupt = true;
+					break;
+				}
+
+				/* No overlap as in LZ is possible here, so memcpy is fine. */
+				memcpy(dp, hend - off, len);
+				dp += len;
+			}
+			else
+			{
+				/* An unset control bit means LITERAL BYTE: copy it over. */
+				if (dp >= destend)
+				{
+					corrupt = true;		/* output buffer is full */
+					break;
+				}
+				*dp++ = *sp++;
+			}
+
+			/* Advance the control bit */
+			ctrl >>= 1;
+		}
+	}
+
+	/* Check that we consumed exactly the input without hitting an error. */
+	if (corrupt || sp != srcend)
+		elog(PANIC, "compressed data is corrupt");
+
+	*finallen = ((char *) dp - dest);
+}
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index d4383ab..df64096 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -67,6 +67,7 @@
 #define XLOG_HEAP_CONTAINS_OLD_TUPLE		(1<<2)
 #define XLOG_HEAP_CONTAINS_OLD_KEY			(1<<3)
 #define XLOG_HEAP_CONTAINS_NEW_TUPLE		(1<<4)
+#define XLOG_HEAP_DELTA_ENCODED				(1<<5)
 
 /* convenience macro for checking whether any form of old tuple was logged */
 #define XLOG_HEAP_CONTAINS_OLD 						\
diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h
index a3eba98..abb5620 100644
--- a/src/include/access/htup_details.h
+++ b/src/include/access/htup_details.h
@@ -740,6 +740,11 @@ extern HeapTuple heap_modify_tuple(HeapTuple tuple,
 extern void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc,
 				  Datum *values, bool *isnull);
 
+extern bool heap_delta_encode(TupleDesc tupleDesc, HeapTuple oldtup,
+				  HeapTuple newtup, char *encdata, uint32 *enclen);
+extern void heap_delta_decode (char *encdata, uint32 enclen, HeapTuple oldtup,
+				HeapTuple newtup);
+
 /* these three are deprecated versions of the three above: */
 extern HeapTuple heap_formtuple(TupleDesc tupleDescriptor,
 			   Datum *values, char *nulls);
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 47e3022..51d6925 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -279,6 +279,7 @@ typedef struct CheckpointStatsData
 extern CheckpointStatsData CheckpointStats;
 
 extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata);
+extern bool XLogCheckBufferNeedsBackup(Buffer buffer);
 extern void XLogFlush(XLogRecPtr RecPtr);
 extern bool XLogBackgroundFlush(void);
 extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
diff --git a/src/include/utils/pg_rbcompress.h b/src/include/utils/pg_rbcompress.h
new file mode 100644
index 0000000..effba23
--- /dev/null
+++ b/src/include/utils/pg_rbcompress.h
@@ -0,0 +1,58 @@
+/* ----------
+ * pg_rbcompress.h -
+ *
+ *	Definitions for the PostgreSQL specific encoding scheme
+ *
+ * src/include/utils/pg_rbcompress.h
+ * ----------
+ */
+
+#ifndef _PG_RBCOMPRESS_H_
+#define _PG_RBCOMPRESS_H_
+
+
+/* ----------
+ * PGRB_Strategy -
+ *
+ *		Some values that control the compression algorithm.
+ *
+ *		min_input_size		Minimum input data size to consider compression.
+ *
+ *		max_input_size		Maximum input data size to consider compression.
+ *
+ *		min_comp_rate		Minimum compression rate (0-99%) to require.
+ *							Regardless of min_comp_rate, the output must be
+ *							smaller than the input, else we don't store
+ *							compressed.
+ * ----------
+ */
+typedef struct PGRB_Strategy
+{
+	int32		min_input_size;
+	int32		max_input_size;
+	int32		min_comp_rate;
+} PGRB_Strategy;
+
+
+/* ----------
+ * The standard strategies
+ *
+ *		PGRB_strategy_default		Recommended default strategy for WAL
+ *									compression.
+ * ----------
+ */
+extern const PGRB_Strategy *const PGRB_strategy_default;
+
+
+/* ----------
+ * Global function declarations
+ * ----------
+ */
+extern bool pgrb_delta_encode(const char *source, int32 slen,
+				  const char *history, int32 hlen,
+				  char *dest, uint32 *finallen, const PGRB_Strategy *strategy);
+extern void pgrb_delta_decode(const char *source, uint32 srclen,
+							  char *dest, uint32 destlen, uint32 *finallen,
+							  const char *history, uint32 histlen);
+
+#endif   /* _PG_RBCOMPRESS_H_ */
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 9b8a4c9..717b90b 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -218,6 +218,7 @@ typedef struct StdRdOptions
 	bool		security_barrier;		/* for views */
 	int			check_option_offset;	/* for views */
 	bool		user_catalog_table;		/* use as an additional catalog relation */
+	bool		wal_compress_update;	/* compress wal tuple for update */
 } StdRdOptions;
 
 #define HEAP_MIN_FILLFACTOR			10
@@ -296,6 +297,15 @@ typedef struct StdRdOptions
 	 ((StdRdOptions *) (relation)->rd_options)->user_catalog_table : false)
 
 /*
+ * RelationIsEnabledForWalCompression
+ *		Returns whether the wal for update operation on relation can
+ *      be compressed.
+ */
+#define RelationIsEnabledForWalCompression(relation)	\
+	((relation)->rd_options ?				\
+	 ((StdRdOptions *) (relation)->rd_options)->wal_compress_update : true)
+
+/*
  * RelationIsValid
  *		True iff relation descriptor is valid.
  */
diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out
index 71b856f..af46df2 100644
--- a/src/test/regress/expected/update.out
+++ b/src/test/regress/expected/update.out
@@ -97,3 +97,73 @@ SELECT a, b, char_length(c) FROM update_test;
 (2 rows)
 
 DROP TABLE update_test;
+--
+-- Test to update continuos and non continuos columns
+--
+DROP TABLE IF EXISTS update_test;
+NOTICE:  table "update_test" does not exist, skipping
+CREATE TABLE update_test (
+		bser bigserial,
+		bln boolean,
+		ename VARCHAR(25),
+		perf_f float(8),
+		grade CHAR,
+		dept CHAR(5) NOT NULL,
+		dob DATE,
+		idnum INT,
+		addr VARCHAR(30) NOT NULL,
+		destn CHAR(6),
+		Gend CHAR,
+		samba BIGINT,
+		hgt float,
+		ctime TIME
+);
+INSERT INTO update_test VALUES (
+		nextval('update_test_bser_seq'::regclass),
+		TRUE,
+		'Test',
+		7.169,
+		'B',
+		'CSD',
+		'2000-01-01',
+		520,
+		'road2,
+		streeeeet2,
+		city2',
+		'dcy2',
+		'M',
+		12000,
+		50.4,
+		'00:00:00.0'
+);
+SELECT * from update_test;
+ bser | bln | ename | perf_f | grade | dept  |    dob     | idnum |            addr             | destn  | gend | samba | hgt  |  ctime   
+------+-----+-------+--------+-------+-------+------------+-------+-----------------------------+--------+------+-------+------+----------
+    1 | t   | Test  |  7.169 | B     | CSD   | 01-01-2000 |   520 | road2,                     +| dcy2   | M    | 12000 | 50.4 | 00:00:00
+      |     |       |        |       |       |            |       |                 streeeeet2,+|        |      |       |      | 
+      |     |       |        |       |       |            |       |                 city2       |        |      |       |      | 
+(1 row)
+
+-- update first column
+UPDATE update_test SET bser = bser - 1 + 1;
+-- update middle column
+UPDATE update_test SET perf_f = 8.9;
+-- update last column
+UPDATE update_test SET ctime = '00:00:00.1';
+-- update 3 continuos columns
+UPDATE update_test SET destn = 'dcy2', samba = 0 WHERE Gend = 'M' and dept = 'CSD';
+-- update two non continuos columns
+UPDATE update_test SET destn = 'moved', samba = 0;
+UPDATE update_test SET bln = FALSE, hgt = 10.1;
+-- update causing some column alignment difference
+UPDATE update_test SET ename = 'Tes';
+UPDATE update_test SET dept = 'Test';
+SELECT * from update_test;
+ bser | bln | ename | perf_f | grade | dept  |    dob     | idnum |            addr             | destn  | gend | samba | hgt  |   ctime    
+------+-----+-------+--------+-------+-------+------------+-------+-----------------------------+--------+------+-------+------+------------
+    1 | f   | Tes   |    8.9 | B     | Test  | 01-01-2000 |   520 | road2,                     +| moved  | M    |     0 | 10.1 | 00:00:00.1
+      |     |       |        |       |       |            |       |                 streeeeet2,+|        |      |       |      | 
+      |     |       |        |       |       |            |       |                 city2       |        |      |       |      | 
+(1 row)
+
+DROP TABLE update_test;
diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql
index a8a028f..1806992 100644
--- a/src/test/regress/sql/update.sql
+++ b/src/test/regress/sql/update.sql
@@ -59,3 +59,70 @@ UPDATE update_test SET c = repeat('x', 10000) WHERE c = 'car';
 SELECT a, b, char_length(c) FROM update_test;
 
 DROP TABLE update_test;
+
+
+--
+-- Test to update continuos and non continuos columns
+--
+
+DROP TABLE IF EXISTS update_test;
+CREATE TABLE update_test (
+		bser bigserial,
+		bln boolean,
+		ename VARCHAR(25),
+		perf_f float(8),
+		grade CHAR,
+		dept CHAR(5) NOT NULL,
+		dob DATE,
+		idnum INT,
+		addr VARCHAR(30) NOT NULL,
+		destn CHAR(6),
+		Gend CHAR,
+		samba BIGINT,
+		hgt float,
+		ctime TIME
+);
+
+INSERT INTO update_test VALUES (
+		nextval('update_test_bser_seq'::regclass),
+		TRUE,
+		'Test',
+		7.169,
+		'B',
+		'CSD',
+		'2000-01-01',
+		520,
+		'road2,
+		streeeeet2,
+		city2',
+		'dcy2',
+		'M',
+		12000,
+		50.4,
+		'00:00:00.0'
+);
+
+SELECT * from update_test;
+
+-- update first column
+UPDATE update_test SET bser = bser - 1 + 1;
+
+-- update middle column
+UPDATE update_test SET perf_f = 8.9;
+
+-- update last column
+UPDATE update_test SET ctime = '00:00:00.1';
+
+-- update 3 continuos columns
+UPDATE update_test SET destn = 'dcy2', samba = 0 WHERE Gend = 'M' and dept = 'CSD';
+
+-- update two non continuos columns
+UPDATE update_test SET destn = 'moved', samba = 0;
+UPDATE update_test SET bln = FALSE, hgt = 10.1;
+
+-- update causing some column alignment difference
+UPDATE update_test SET ename = 'Tes';
+UPDATE update_test SET dept = 'Test';
+
+SELECT * from update_test;
+DROP TABLE update_test;

-- 
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] Performance Improvement by reducing WAL for Update Operation

Reply via email to