I came up with the attached patch, to reduce the WAL volume of GIN insertions. It become fairly large, but I guess that's not too surprising as the old WAL-logging method was basically to dump the whole page to WAL record. This is now a lot more fine-grained and smarter. I separated constructing the WAL record from copying the changes back to the disk page, which IMHO is a readability improvement even though it's more code.

There are two parts to this patch:

* leafRepackItems has been rewritten. The previous coding basically searched for the first modified item, and decoded and re-encoded everything on the page that after that. Now it tries harder to avoid re-encoding segments that are still reasonably sized (between 128 and 384 bytes, with the target for new segments being 256 bytes). This ought to make random updates faster as a bonus, but I didn't performance test that.

* Track more carefully which segments on the page have been modified. The in-memory structure used to manipulate a page now keeps an action code for each segment, indicating if the segment is completely new, deleted, or replaced with new content, or if just some new items have been added to it. These same actions are WAL-logged, and replayed in the redo routine.

This brings the WAL volume back to the same ballpark as 9.3. Or better, depending on the operation.

Fujii, Alexander, how does this look to you?

- Heikki
diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c
index 313d53c..56b456e 100644
--- a/src/backend/access/gin/gindatapage.c
+++ b/src/backend/access/gin/gindatapage.c
@@ -22,17 +22,17 @@
 #include "utils/rel.h"
 
 /*
- * Size of the posting lists stored on leaf pages, in bytes. The code can
- * deal with any size, but random access is more efficient when a number of
- * smaller lists are stored, rather than one big list.
- */
-#define GinPostingListSegmentMaxSize 256
-
-/*
- * Existing posting lists smaller than this are recompressed, when inserting
- * new items to page.
+ * Min, Max and Target size of posting lists stored on leaf pages, in bytes.
+ *
+ * The code can deal with any size, but random access is more efficient when
+ * a number of smaller lists are stored, rather than one big list. If a
+ * posting list would become larger than Max size as a result of insertions,
+ * it is split into two. If a posting list would be smaller than minimum
+ * size
  */
-#define GinPostingListSegmentMinSize 192
+#define GinPostingListSegmentMaxSize 384
+#define GinPostingListSegmentTargetSize 256
+#define GinPostingListSegmentMinSize 128
 
 /*
  * At least this many items fit in a GinPostingListSegmentMaxSize-bytes
@@ -55,12 +55,32 @@ typedef struct
 	dlist_node *lastleft;		/* last segment on left page */
 	int			lsize;			/* total size on left page */
 	int			rsize;			/* total size on right page */
+
+	bool		oldformat;		/* page is in pre-9.4 on disk */
 } disassembledLeaf;
 
 typedef struct
 {
 	dlist_node	node;		/* linked list pointers */
 
+	/*-------------
+	 * 'action' indicates the status of this in-memory segment, compared to
+	 * what's on disk. It is one of the GIN_SEGMENT_* action codes:
+	 *
+	 * UNMODIFIED	no changes
+	 * DELETED		the segment is to be removed. 'seg' and 'items' are
+	 *				ignored
+	 * INSERT		this is a completely new segment
+	 * REPLACE		this replaces an existing segment with new content
+	 * ADDITEMS		like REPLACE, but we track in detail what items have
+	 *				been added to this segment, in 'modifieditems'
+	 *-------------
+	 */
+	char		action;
+
+	ItemPointerData *modifieditems;
+	int			nmodifieditems;
+
 	/*
 	 * The following fields represent the items in this segment.
 	 * If 'items' is not NULL, it contains a palloc'd array of the items
@@ -72,8 +92,6 @@ typedef struct
 	GinPostingList *seg;
 	ItemPointer items;
 	int			nitems;			/* # of items in 'items', if items != NULL */
-
-	bool		modified;		/* is this segment on page already? */
 } leafSegmentInfo;
 
 static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems);
@@ -87,9 +105,9 @@ static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining);
 static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems);
 
 
-static void dataPlaceToPageLeafRecompress(Buffer buf,
-							  disassembledLeaf *leaf,
-							  XLogRecData **prdata);
+static XLogRecData *constructLeafRecompressWALData(Buffer buf,
+							   disassembledLeaf *leaf);
+static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf);
 static void dataPlaceToPageLeafSplit(Buffer buf,
 						 disassembledLeaf *leaf,
 						 ItemPointerData lbound, ItemPointerData rbound,
@@ -563,15 +581,21 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
 	if (!needsplit)
 	{
 		/*
-		 * Great, all the items fit on a single page. Write the segments to
-		 * the page, and WAL-log appropriately.
+		 * Great, all the items fit on a single page. Construct a WAL record
+		 * describing the changes we made, and write the segments back to
+		 * the page.
 		 *
 		 * Once we start modifying the page, there's no turning back. The
 		 * caller is responsible for calling END_CRIT_SECTION() after writing
 		 * the WAL record.
 		 */
+		MemoryContextSwitchTo(oldCxt);
+		if (RelationNeedsWAL(btree->index))
+			*prdata = constructLeafRecompressWALData(buf, leaf);
+		else
+			*prdata = NULL;
 		START_CRIT_SECTION();
-		dataPlaceToPageLeafRecompress(buf, leaf, prdata);
+		dataPlaceToPageLeafRecompress(buf, leaf);
 
 		if (append)
 			elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)",
@@ -619,8 +643,6 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
 						break;
 				}
 
-				 /* don't consider segments moved to right as unmodified */
-				lastleftinfo->modified = true;
 				leaf->lsize -= segsize;
 				leaf->rsize += segsize;
 				leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft);
@@ -716,14 +738,15 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
 				/* Removing an item never increases the size of the segment */
 				if (npacked != ncleaned)
 					elog(ERROR, "could not fit vacuumed posting list");
+				seginfo->action = GIN_SEGMENT_REPLACE;
 			}
 			else
 			{
 				seginfo->seg = NULL;
 				seginfo->items = NULL;
+				seginfo->action = GIN_SEGMENT_DELETE;
 			}
 			seginfo->nitems = ncleaned;
-			seginfo->modified = true;
 
 			removedsomething = true;
 		}
@@ -748,10 +771,33 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
 	 */
 	if (removedsomething)
 	{
-		XLogRecData *payloadrdata;
+		XLogRecData *payloadrdata = NULL;
+		bool		modified;
 
+		/*
+		 * Make sure we have a palloc'd copy of all segments, after the
+		 * first segment that is modified. (dataPlaceToPageLeafRecompress
+		 * requires this).
+		 */
+		modified = false;
+		dlist_foreach(iter, &leaf->segments)
+		{
+			leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
+			if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
+				modified = true;
+			if (modified && seginfo->action != GIN_SEGMENT_DELETE)
+			{
+				int			segsize = SizeOfGinPostingList(seginfo->seg);
+				GinPostingList *tmp = (GinPostingList *) palloc(segsize);
+				memcpy(tmp, seginfo->seg, segsize);
+				seginfo->seg = tmp;
+			}
+		}
+
+		if (RelationNeedsWAL(indexrel))
+			payloadrdata = constructLeafRecompressWALData(buffer, leaf);
 		START_CRIT_SECTION();
-		dataPlaceToPageLeafRecompress(buffer, leaf, &payloadrdata);
+		dataPlaceToPageLeafRecompress(buffer, leaf);
 
 		MarkBufferDirty(buffer);
 
@@ -778,96 +824,168 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
 }
 
 /*
+ * Construct a ginxlogRecompressDataLeaf record representing the changes
+ * in *leaf.
+ */
+static XLogRecData *
+constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf)
+{
+	int			nmodified = 0;
+	char	   *walbufbegin;
+	char	   *walbufend;
+	XLogRecData *rdata;
+	dlist_iter	iter;
+	int			segno;
+	ginxlogRecompressDataLeaf *recompress_xlog;
+
+	/* Count the modified segments */
+	dlist_foreach(iter, &leaf->segments)
+	{
+		leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+												   iter.cur);
+
+		if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
+			nmodified++;
+	}
+
+	walbufbegin = palloc(
+		sizeof(ginxlogRecompressDataLeaf) +
+		BLCKSZ + 			/* max size needed to hold the segment data */
+		nmodified * 2 +		/* (segno + action) per action */
+		sizeof(XLogRecData));
+	walbufend = walbufbegin;
+
+	recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend;
+	walbufend += sizeof(ginxlogRecompressDataLeaf);
+
+	recompress_xlog->nactions = nmodified;
+
+	segno = 0;
+	dlist_foreach(iter, &leaf->segments)
+	{
+		leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
+		int			segsize = 0;
+		int			datalen;
+		uint8		action = seginfo->action;
+
+		if (action == GIN_SEGMENT_UNMODIFIED)
+		{
+			segno++;
+			continue;
+		}
+
+		if (action != GIN_SEGMENT_DELETE)
+			segsize = SizeOfGinPostingList(seginfo->seg);
+
+		/*
+		 * If storing the uncompressed list of added item pointers would take
+		 * more space than storing the compressed segment as is, do that
+		 * instead.
+		 */
+		if (action == GIN_SEGMENT_ADDITEMS &&
+			seginfo->nmodifieditems * sizeof(ItemPointerData) > segsize)
+		{
+			action = GIN_SEGMENT_REPLACE;
+		}
+
+		*((uint8 *) (walbufend++)) = segno;
+		*(walbufend++) = action;
+
+		switch (action)
+		{
+			case GIN_SEGMENT_DELETE:
+				datalen = 0;
+				break;
+
+			case GIN_SEGMENT_ADDITEMS:
+				datalen = seginfo->nmodifieditems * sizeof(ItemPointerData);
+				*((uint16 *) walbufend) = seginfo->nmodifieditems;
+				memcpy(walbufend + sizeof(uint16), seginfo->modifieditems, datalen);
+				datalen += sizeof(uint16);
+				break;
+
+			case GIN_SEGMENT_INSERT:
+			case GIN_SEGMENT_REPLACE:
+				datalen = SHORTALIGN(segsize);
+				memcpy(walbufend, seginfo->seg, segsize);
+				break;
+
+			default:
+				elog(ERROR, "unexpected GIN leaf action %d", action);
+		}
+		walbufend += datalen;
+
+		if (action != GIN_SEGMENT_INSERT)
+			segno++;
+	}
+
+	rdata = (XLogRecData *) MAXALIGN(walbufend);
+	rdata->buffer = buf;
+	rdata->buffer_std = TRUE;
+	rdata->data = walbufbegin;
+	rdata->len = walbufend - walbufbegin;
+	rdata->next = NULL;
+
+	return rdata;
+}
+
+/*
  * Assemble a disassembled posting tree leaf page back to a buffer.
  *
  * *prdata is filled with WAL information about this operation. The caller
  * is responsible for inserting to the WAL, along with any other information
  * about the operation that triggered this recompression.
  *
- * NOTE: The segment pointers can point directly to the same buffer, with
- * the limitation that any earlier segment must not overlap with an original,
- * later segment. In other words, some segments may point the original buffer
- * as long as you don't make any segments larger. Currently, leafRepackItems
- * satisies this rule because it rewrites all segments after the first
- * modified one, and vacuum can only make segments shorter.
+ * NOTE: The segment pointers must not point directly to the same buffer,
+ * except for segments that have not been modified, and none of the preceding
+ * segments have been modified either.
  */
 static void
-dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf,
-							  XLogRecData **prdata)
+dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf)
 {
 	Page		page = BufferGetPage(buf);
 	char	   *ptr;
 	int			newsize;
-	int			unmodifiedsize;
 	bool		modified = false;
 	dlist_iter	iter;
+	int			segsize;
+
 	/*
-	 * these must be static so they can be returned to caller (no pallocs
-	 * since we're in a critical section!)
+	 * If the page was in pre-9.4 format before, convert the header, and
+	 * force all segments to be copied to the page whether they were modified
+	 * or not.
 	 */
-	static ginxlogRecompressDataLeaf recompress_xlog;
-	static XLogRecData rdata[2];
+	if (!GinPageIsCompressed(page))
+	{
+		Assert(leaf->oldformat);
+		GinPageSetCompressed(page);
+		GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
+		modified = true;
+	}
 
 	ptr = (char *) GinDataLeafPageGetPostingList(page);
 	newsize = 0;
-	unmodifiedsize = 0;
-	modified = false;
 	dlist_foreach(iter, &leaf->segments)
 	{
 		leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
-		int			segsize;
 
-		if (seginfo->modified)
+		if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
 			modified = true;
 
-		/*
-		 * Nothing to do with empty segments, except keep track if they've been
-		 * modified
-		 */
-		if (seginfo->seg == NULL)
-		{
-			Assert(seginfo->items == NULL);
-			continue;
-		}
+		if (seginfo->action != GIN_SEGMENT_DELETE)
+			segsize = SizeOfGinPostingList(seginfo->seg);
+		else
+			segsize = 0;
 
-		segsize = SizeOfGinPostingList(seginfo->seg);
+		if (modified)
+			memcpy(ptr, seginfo->seg, segsize);
 
-		if (!modified)
-			unmodifiedsize += segsize;
-		else
-		{
-			/*
-			 * Use memmove rather than memcpy, in case the segment points
-			 * to the same buffer
-			 */
-			memmove(ptr, seginfo->seg, segsize);
-		}
 		ptr += segsize;
 		newsize += segsize;
 	}
+
 	Assert(newsize <= GinDataLeafMaxContentSize);
 	GinDataLeafPageSetPostingListSize(page, newsize);
-
-	/* Reset these in case the page was in pre-9.4 format before */
-	GinPageSetCompressed(page);
-	GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
-
-	/* Put WAL data */
-	recompress_xlog.length = (uint16) newsize;
-	recompress_xlog.unmodifiedsize = unmodifiedsize;
-
-	rdata[0].buffer = InvalidBuffer;
-	rdata[0].data = (char *) &recompress_xlog;
-	rdata[0].len = offsetof(ginxlogRecompressDataLeaf, newdata);
-	rdata[0].next = &rdata[1];
-
-	rdata[1].buffer = buf;
-	rdata[1].buffer_std = TRUE;
-	rdata[1].data = ((char *) GinDataLeafPageGetPostingList(page)) + unmodifiedsize;
-	rdata[1].len = newsize - unmodifiedsize;
-	rdata[1].next = NULL;
-
-	*prdata = rdata;
 }
 
 /*
@@ -914,9 +1032,12 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf,
 		seginfo = dlist_container(leafSegmentInfo, node, node);
 		segsize = SizeOfGinPostingList(seginfo->seg);
 
-		memcpy(ptr, seginfo->seg, segsize);
-		ptr += segsize;
-		lsize += segsize;
+		if (seginfo->action != GIN_SEGMENT_DELETE)
+		{
+			memcpy(ptr, seginfo->seg, segsize);
+			ptr += segsize;
+			lsize += segsize;
+		}
 	}
 	Assert(lsize == leaf->lsize);
 	GinDataLeafPageSetPostingListSize(lpage, lsize);
@@ -932,9 +1053,12 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf,
 		seginfo = dlist_container(leafSegmentInfo, node, node);
 		segsize = SizeOfGinPostingList(seginfo->seg);
 
-		memcpy(ptr, seginfo->seg, segsize);
-		ptr += segsize;
-		rsize += segsize;
+		if (seginfo->action != GIN_SEGMENT_DELETE)
+		{
+			memcpy(ptr, seginfo->seg, segsize);
+			ptr += segsize;
+			rsize += segsize;
+		}
 
 		if (!dlist_has_next(&leaf->segments, node))
 			break;
@@ -1195,14 +1319,15 @@ disassembleLeaf(Page page)
 		{
 			leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo));
 
+			seginfo->action = GIN_SEGMENT_UNMODIFIED;
 			seginfo->seg = seg;
 			seginfo->items = NULL;
 			seginfo->nitems = 0;
-			seginfo->modified = false;
 			dlist_push_tail(&leaf->segments, &seginfo->node);
 
 			seg = GinNextPostingListSegment(seg);
 		}
+		leaf->oldformat = false;
 	}
 	else
 	{
@@ -1218,14 +1343,15 @@ disassembleLeaf(Page page)
 
 		seginfo = palloc(sizeof(leafSegmentInfo));
 
+		seginfo->action = GIN_SEGMENT_REPLACE;
 		seginfo->seg = NULL;
 		seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData));
 		memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData));
 		seginfo->nitems = nuncompressed;
-		/* make sure we rewrite this to disk */
-		seginfo->modified = true;
 
 		dlist_push_tail(&leaf->segments, &seginfo->node);
+
+		leaf->oldformat = true;
 	}
 
 	return leaf;
@@ -1247,6 +1373,7 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
 	ItemPointer nextnew = newItems;
 	int			newleft = nNewItems;
 	bool		modified = false;
+	leafSegmentInfo *newseg;
 
 	/*
 	 * If the page is completely empty, just construct one new segment to
@@ -1254,14 +1381,12 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
 	 */
 	if (dlist_is_empty(&leaf->segments))
 	{
-		/* create a new segment for the new entries */
-		leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo));
-
-		seginfo->seg = NULL;
-		seginfo->items = newItems;
-		seginfo->nitems = nNewItems;
-		seginfo->modified = true;
-		dlist_push_tail(&leaf->segments, &seginfo->node);
+		newseg = palloc(sizeof(leafSegmentInfo));
+		newseg->seg = NULL;
+		newseg->items = newItems;
+		newseg->nitems = nNewItems;
+		newseg->action = GIN_SEGMENT_INSERT;
+		dlist_push_tail(&leaf->segments, &newseg->node);
 		return true;
 	}
 
@@ -1303,15 +1428,51 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
 		if (!cur->items)
 			cur->items = ginPostingListDecode(cur->seg, &cur->nitems);
 
+		/*
+		 * Fast path for the important special case that we're appending to
+		 * the end of the page: don't let the last segment on the page grow
+		 * larger than the target, create a new segment before that happens.
+		 */
+		if (!dlist_has_next(&leaf->segments, iter.cur) &&
+			ginCompareItemPointers(&cur->items[cur->nitems - 1], &nextnew[0]) > 0 &&
+			cur->seg != NULL &&
+			SizeOfGinPostingList(cur->seg) >= GinPostingListSegmentTargetSize)
+		{
+			newseg = palloc(sizeof(leafSegmentInfo));
+			newseg->seg = NULL;
+			newseg->items = nextnew;
+			newseg->nitems = nthis;
+			newseg->action = GIN_SEGMENT_INSERT;
+			dlist_push_tail(&leaf->segments, &newseg->node);
+			modified = true;
+			break;
+		}
+
 		tmpitems = ginMergeItemPointers(cur->items, cur->nitems,
 										nextnew, nthis,
 										&ntmpitems);
 		if (ntmpitems != cur->nitems)
 		{
+			/*
+			 * If there are no duplicates, track the added items so that we
+			 * can emit a compact ADDITEMS WAL record later on. (it doesn't
+			 * seem worth re-checking which items were duplicates, if there
+			 * were any)
+			 */
+			if (ntmpitems == nthis + cur->nitems &&
+				cur->action == GIN_SEGMENT_UNMODIFIED)
+			{
+				cur->action = GIN_SEGMENT_ADDITEMS;
+				cur->modifieditems = nextnew;
+				cur->nmodifieditems = nthis;
+			}
+			else
+				cur->action = GIN_SEGMENT_REPLACE;
+
 			cur->items = tmpitems;
 			cur->nitems = ntmpitems;
 			cur->seg = NULL;
-			modified = cur->modified = true;
+			modified = true;
 		}
 
 		nextnew += nthis;
@@ -1331,133 +1492,160 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
  * If all items fit, *remaining is set to invalid.
  *
  * Returns true if the page has to be split.
- *
- * XXX: Actually, this re-encodes all segments after the first one that was
- * modified, to make sure the new segments are all more or less of equal
- * length. That's unnecessarily aggressive; if we've only added a single item
- * to one segment, for example, we could re-encode just that single segment,
- * as long as it's still smaller than, say, 2x the normal segment size.
  */
 static bool
 leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining)
 {
-	dlist_iter	iter;
-	ItemPointer allitems;
-	int			nallitems;
 	int			pgused = 0;
-	bool		needsplit;
-	int			totalpacked;
-	dlist_mutable_iter miter;
-	dlist_head	recode_list;
-	int			nrecode;
-	bool		recoding;
+	bool		needsplit = false;
+	dlist_iter	iter;
+	int			segsize;
+	leafSegmentInfo *nextseg;
+	int			npacked;
+	bool		modified;
+	dlist_node *cur_node;
+	dlist_node *next_node;
 
 	ItemPointerSetInvalid(remaining);
 
 	/*
-	 * Find the first segment that needs to be re-coded. Move all segments
-	 * that need recoding to separate list, and count the total number of
-	 * items in them. Also, add up the number of bytes in unmodified segments
-	 * (pgused).
+	 * cannot use dlist_foreach_modify here because we insert adjacent
+	 * items while iterating.
 	 */
-	dlist_init(&recode_list);
-	recoding = false;
-	nrecode = 0;
-	pgused = 0;
-	dlist_foreach_modify(miter, &leaf->segments)
+	for (cur_node = dlist_head_node(&leaf->segments);
+		 cur_node != NULL;
+		 cur_node = next_node)
 	{
-		leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, miter.cur);
+		leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, cur_node);
 
-		/* If the segment was modified, re-encode it */
-		if (seginfo->modified || seginfo->seg == NULL)
-			recoding = true;
-		/*
-		 * Also re-encode abnormally small or large segments. (Vacuum can
-		 * leave behind small segments, and conversion from pre-9.4 format
-		 * can leave behind large segments).
-		 */
-		else if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize)
-			recoding = true;
-		else if (SizeOfGinPostingList(seginfo->seg) > GinPostingListSegmentMaxSize)
-			recoding = true;
+		if (dlist_has_next(&leaf->segments, cur_node))
+			next_node = dlist_next_node(&leaf->segments, cur_node);
+		else
+			next_node = NULL;
 
-		if (recoding)
+		/* Compress the posting list, if necessary */
+		if (seginfo->action != GIN_SEGMENT_DELETE)
 		{
-			if (!seginfo->items)
-				seginfo->items = ginPostingListDecode(seginfo->seg,
-													  &seginfo->nitems);
-			nrecode += seginfo->nitems;
-
-			dlist_delete(miter.cur);
-			dlist_push_tail(&recode_list, &seginfo->node);
-		}
-		else
-			pgused += SizeOfGinPostingList(seginfo->seg);
-	}
+			if (seginfo->seg == NULL)
+			{
+				if (seginfo->nitems > GinPostingListSegmentMaxSize)
+					npacked = 0; /* no chance that it would fit. */
+				else
+				{
+					seginfo->seg = ginCompressPostingList(seginfo->items,
+														  seginfo->nitems,
+												   GinPostingListSegmentMaxSize,
+														  &npacked);
+				}
+				if (npacked != seginfo->nitems)
+				{
+					/*
+					 * Too large. Compress again to the target size, and create
+					 * a new segment to represent the remaining items. The new
+					 * segment is inserted after this one, so it will be
+					 * processed in the next iteration of this loop.
+					 */
+					if (seginfo->seg)
+						pfree(seginfo->seg);
+					seginfo->seg = ginCompressPostingList(seginfo->items,
+														  seginfo->nitems,
+											   GinPostingListSegmentTargetSize,
+														  &npacked);
+					if (seginfo->action != GIN_SEGMENT_INSERT)
+						seginfo->action = GIN_SEGMENT_REPLACE;
+
+					nextseg = palloc(sizeof(leafSegmentInfo));
+					nextseg->action = GIN_SEGMENT_INSERT;
+					nextseg->seg = NULL;
+					nextseg->items = &seginfo->items[npacked];
+					nextseg->nitems = seginfo->nitems - npacked;
+					next_node = &nextseg->node;
+					dlist_insert_after(cur_node, next_node);
+				}
+			}
 
-	if (nrecode == 0)
-		return false; /* nothing to do */
+			/*
+			 * If the segment is very small, merge it with the next
+			 * segment.
+			 */
+			if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize && next_node)
+			{
+				int		nmerged;
+
+				nextseg = dlist_container(leafSegmentInfo, node, next_node);
+
+				if (seginfo->items == NULL)
+					seginfo->items = ginPostingListDecode(seginfo->seg,
+														  &seginfo->nitems);
+				if (nextseg->items == NULL)
+					nextseg->items = ginPostingListDecode(nextseg->seg,
+														  &nextseg->nitems);
+				nextseg->items =
+					ginMergeItemPointers(seginfo->items, seginfo->nitems,
+										 nextseg->items, nextseg->nitems,
+										 &nmerged);
+				Assert(nmerged == seginfo->nitems + nextseg->nitems);
+				nextseg->nitems = nmerged;
+				nextseg->seg = NULL;
+
+				nextseg->action = GIN_SEGMENT_REPLACE;
+				nextseg->modifieditems = NULL;
+				nextseg->nmodifieditems = 0;
+
+				if (seginfo->action == GIN_SEGMENT_INSERT)
+				{
+					dlist_delete(cur_node);
+					continue;
+				}
+				else
+				{
+					seginfo->action = GIN_SEGMENT_DELETE;
+					seginfo->seg = NULL;
+				}
+			}
 
-	/*
-	 * Construct one big array of the items that need to be re-encoded.
-	 */
-	allitems = palloc(nrecode * sizeof(ItemPointerData));
-	nallitems = 0;
-	dlist_foreach(iter, &recode_list)
-	{
-		leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
-		memcpy(&allitems[nallitems], seginfo->items, seginfo->nitems * sizeof(ItemPointerData));
-		nallitems += seginfo->nitems;
-	}
-	Assert(nallitems == nrecode);
+			seginfo->items = NULL;
+			seginfo->nitems = 0;
+		}
 
-	/*
-	 * Start packing the items into segments. Stop when we have consumed
-	 * enough space to fill both pages, or we run out of items.
-	 */
-	totalpacked = 0;
-	needsplit = false;
-	while (totalpacked < nallitems)
-	{
-		leafSegmentInfo *seginfo;
-		int			npacked;
-		GinPostingList *seg;
-		int			segsize;
+		if (seginfo->action == GIN_SEGMENT_DELETE)
+			continue;
 
-		seg = ginCompressPostingList(&allitems[totalpacked],
-									 nallitems - totalpacked,
-									 GinPostingListSegmentMaxSize,
-									 &npacked);
-		segsize = SizeOfGinPostingList(seg);
+		/*
+		 * OK, we now have a compressed version of this segment ready for
+		 * copying to the page. Did we exceed the size that fits on one page?
+		 */
+		segsize = SizeOfGinPostingList(seginfo->seg);
 		if (pgused + segsize > GinDataLeafMaxContentSize)
 		{
 			if (!needsplit)
 			{
 				/* switch to right page */
 				Assert(pgused > 0);
-				leaf->lastleft = dlist_tail_node(&leaf->segments);
+				leaf->lastleft = dlist_prev_node(&leaf->segments, cur_node);
 				needsplit = true;
 				leaf->lsize = pgused;
 				pgused = 0;
 			}
 			else
 			{
-				/* filled both pages */
-				*remaining = allitems[totalpacked];
+				/*
+				 * Filled both pages. The last segment we constructed did not
+				 * fit.
+				 */
+				*remaining = seginfo->seg->first;
+
+				/*
+				 * remove any remaining segments that did not fit from the list.
+				 */
+				while (dlist_has_next(&leaf->segments, cur_node))
+					dlist_delete(dlist_next_node(&leaf->segments, cur_node));
+				dlist_delete(cur_node);
 				break;
 			}
 		}
 
-		seginfo = palloc(sizeof(leafSegmentInfo));
-		seginfo->seg = seg;
-		seginfo->items = &allitems[totalpacked];
-		seginfo->nitems = npacked;
-		seginfo->modified = true;
-
-		dlist_push_tail(&leaf->segments, &seginfo->node);
-
 		pgused += segsize;
-		totalpacked += npacked;
 	}
 
 	if (!needsplit)
@@ -1471,6 +1659,31 @@ leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining)
 	Assert(leaf->lsize <= GinDataLeafMaxContentSize);
 	Assert(leaf->rsize <= GinDataLeafMaxContentSize);
 
+	modified = false;
+	dlist_foreach(iter, &leaf->segments)
+	{
+		leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,  iter.cur);
+
+		if (!modified && seginfo->action != GIN_SEGMENT_UNMODIFIED)
+		{
+			modified = true;
+		}
+		/*
+		 * Need to make a palloc'd copy of every segment after the first
+		 * modified one, because as we start copying items to the original
+		 * page, we might overwrite an existing segment.
+		 */
+		else if (modified && seginfo->action == GIN_SEGMENT_UNMODIFIED)
+		{
+			GinPostingList *tmp;
+
+			segsize = SizeOfGinPostingList(seginfo->seg);
+			tmp = palloc(segsize);
+			memcpy(tmp, seginfo->seg, segsize);
+			seginfo->seg = tmp;
+		}
+	}
+
 	return needsplit;
 }
 
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c
index c46f7ac..5681e14 100644
--- a/src/backend/access/gin/ginxlog.c
+++ b/src/backend/access/gin/ginxlog.c
@@ -145,15 +145,159 @@ ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rda
 static void
 ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data)
 {
-	Pointer		segment;
-
-	/* Copy the new data to the right place */
-	segment = ((Pointer) GinDataLeafPageGetPostingList(page))
-		+ data->unmodifiedsize;
-	memcpy(segment, data->newdata, data->length - data->unmodifiedsize);
-	GinDataLeafPageSetPostingListSize(page, data->length);
-	GinPageSetCompressed(page);
-	GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
+	int			nactions = data->nactions;
+	int			actionno;
+	int			segno;
+	GinPostingList *oldseg;
+	Pointer		segmentend;
+	char	   *walbuf;
+	int			totalsize;
+
+	/*
+	 * If the page is in pre-9.4 format, convert to new format first.
+	 */
+	if (!GinPageIsCompressed(page))
+	{
+		ItemPointer uncompressed = (ItemPointer) GinDataPageGetData(page);
+		int			nuncompressed = GinPageGetOpaque(page)->maxoff;
+		int			npacked;
+		GinPostingList *plist;
+
+		plist = ginCompressPostingList(uncompressed, nuncompressed,
+									   BLCKSZ, &npacked);
+		Assert(npacked == nuncompressed);
+
+		totalsize = SizeOfGinPostingList(plist);
+
+		memcpy(GinDataLeafPageGetPostingList(page), plist, totalsize);
+		GinDataLeafPageSetPostingListSize(page, totalsize);
+		GinPageSetCompressed(page);
+		GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
+	}
+
+	oldseg = GinDataLeafPageGetPostingList(page);
+	segmentend = (Pointer) oldseg + GinDataLeafPageGetPostingListSize(page);
+	segno = 0;
+
+	walbuf = ((char *) data) + sizeof(ginxlogRecompressDataLeaf);
+	for (actionno = 0; actionno < nactions; actionno++)
+	{
+		uint8		a_segno = *((uint8 *) (walbuf++));
+		uint8		a_action = *((uint8 *) (walbuf++));
+		GinPostingList *newseg = NULL;
+		int			newsegsize = 0;
+		ItemPointerData *items = NULL;
+		uint16		nitems = 0;
+		ItemPointerData *olditems;
+		int			nolditems;
+		ItemPointerData *newitems;
+		int			nnewitems;
+		int			segsize;
+		Pointer		segptr;
+		int			szleft;
+
+		/* Extract all the information we need from the WAL record */
+		if (a_action == GIN_SEGMENT_INSERT ||
+			a_action == GIN_SEGMENT_REPLACE)
+		{
+			newseg = (GinPostingList *) walbuf;
+			newsegsize = SizeOfGinPostingList(newseg);
+			walbuf += SHORTALIGN(newsegsize);
+		}
+
+		if (a_action == GIN_SEGMENT_ADDITEMS)
+		{
+			memcpy(&nitems, walbuf, sizeof(uint16));
+			walbuf += sizeof(uint16);
+			items = (ItemPointerData *) walbuf;
+			walbuf += nitems * sizeof(ItemPointerData);
+		}
+
+		/* Skip to the segment that this action concerns */
+		Assert(segno <= a_segno);
+		while (segno < a_segno)
+		{
+			oldseg = GinNextPostingListSegment(oldseg);
+			segno++;
+		}
+
+		/*
+		 * ADDITEMS action is handled like REPLACE, but the new segment to
+		 * replace the old one is reconstructed using the old segment from
+		 * disk and the new items from the WAL record.
+		 */
+		if (a_action == GIN_SEGMENT_ADDITEMS)
+		{
+			int			npacked;
+
+			olditems = ginPostingListDecode(oldseg, &nolditems);
+
+			newitems = ginMergeItemPointers(items, nitems,
+											olditems, nolditems,
+											&nnewitems);
+			Assert(nnewitems == nolditems + nitems);
+
+			newseg = ginCompressPostingList(newitems, nnewitems,
+											BLCKSZ, &npacked);
+			Assert(npacked == nnewitems);
+
+			newsegsize = SizeOfGinPostingList(newseg);
+			a_action = GIN_SEGMENT_REPLACE;
+		}
+
+		segptr = (Pointer) oldseg;
+		if (segptr != segmentend)
+			segsize = SizeOfGinPostingList(oldseg);
+		else
+		{
+			/*
+			 * Positioned after the last existing segment. Only INSERTs
+			 * expected here.
+			 */
+			Assert(a_action == GIN_SEGMENT_INSERT);
+			segsize = 0;
+		}
+		szleft = segmentend - segptr;
+
+		switch (a_action)
+		{
+			case GIN_SEGMENT_DELETE:
+				memmove(segptr, segptr + segsize, szleft - segsize);
+				segmentend -= segsize;
+
+				segno++;
+				break;
+
+			case GIN_SEGMENT_INSERT:
+				/* make room for the new segment */
+				memmove(segptr + newsegsize, segptr, szleft);
+				/* copy the new segment in place */
+				memcpy(segptr, newseg, newsegsize);
+				segmentend += newsegsize;
+				segptr += newsegsize;
+				break;
+
+			case GIN_SEGMENT_REPLACE:
+				/* shift the segments that follow */
+				memmove(segptr + newsegsize,
+						segptr + segsize,
+						szleft - segsize);
+				/* copy the replacement segment in place */
+				memcpy(segptr, newseg, newsegsize);
+				segmentend -= segsize;
+				segmentend += newsegsize;
+				segptr += newsegsize;
+				segno++;
+				break;
+
+			default:
+				elog(PANIC, "unexpected GIN leaf action: %u", a_action);
+		}
+		oldseg = (GinPostingList *) segptr;
+	}
+
+	totalsize = segmentend - (Pointer) GinDataLeafPageGetPostingList(page);
+	GinDataLeafPageSetPostingListSize(page, totalsize);
 }
 
 static void
diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c
index 1983bcc..aa60c8d 100644
--- a/src/backend/access/rmgrdesc/gindesc.c
+++ b/src/backend/access/rmgrdesc/gindesc.c
@@ -25,6 +25,57 @@ desc_node(StringInfo buf, RelFileNode node, BlockNumber blkno)
 					 node.spcNode, node.dbNode, node.relNode, blkno);
 }
 
+static void
+desc_recompress_leaf(StringInfo buf, ginxlogRecompressDataLeaf *insertData)
+{
+	int			i;
+	char	   *walbuf = ((char *) insertData) + sizeof(ginxlogRecompressDataLeaf);
+
+	appendStringInfo(buf, " %d segments:", (int) insertData->nactions);
+
+	for (i = 0; i < insertData->nactions; i++)
+	{
+		uint8		a_segno = *((uint8 *) (walbuf++));
+		uint8		a_action = *((uint8 *) (walbuf++));
+		uint16		nitems = 0;
+		int			newsegsize = 0;
+
+		if (a_action == GIN_SEGMENT_INSERT ||
+			a_action == GIN_SEGMENT_REPLACE)
+		{
+			newsegsize = SizeOfGinPostingList((GinPostingList *) walbuf);
+			walbuf += SHORTALIGN(newsegsize);
+		}
+
+		if (a_action == GIN_SEGMENT_ADDITEMS)
+		{
+			memcpy(&nitems, walbuf, sizeof(uint16));
+			walbuf += sizeof(uint16);
+			walbuf += nitems * sizeof(ItemPointerData);
+		}
+
+		switch(a_action)
+		{
+			case GIN_SEGMENT_ADDITEMS:
+				appendStringInfo(buf, " %d (add %d items)", a_segno, nitems);
+				break;
+			case GIN_SEGMENT_DELETE:
+				appendStringInfo(buf, " %d (delete)", a_segno);
+				break;
+			case GIN_SEGMENT_INSERT:
+				appendStringInfo(buf, " %d (insert)", a_segno);
+				break;
+			case GIN_SEGMENT_REPLACE:
+				appendStringInfo(buf, " %d (replace)", a_segno);
+				break;
+			default:
+				appendStringInfo(buf, " %d unknown action %d ???", a_segno, a_action);
+				/* cannot decode unrecognized actions further */
+				return;
+		}
+	}
+}
+
 void
 gin_desc(StringInfo buf, uint8 xl_info, char *rec)
 {
@@ -70,9 +121,10 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec)
 					ginxlogRecompressDataLeaf *insertData =
 						(ginxlogRecompressDataLeaf *) payload;
 
-					appendStringInfo(buf, " unmodified: %u length: %u (compressed)",
-									 insertData->unmodifiedsize,
-									 insertData->length);
+					if (xl_info & XLR_BKP_BLOCK(0))
+						appendStringInfo(buf, " (full page image)");
+					else
+						desc_recompress_leaf(buf, insertData);
 				}
 				else
 				{
@@ -105,9 +157,10 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec)
 				ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) rec;
 				appendStringInfoString(buf, "Vacuum data leaf page, ");
 				desc_node(buf, xlrec->node, xlrec->blkno);
-				appendStringInfo(buf, " unmodified: %u length: %u",
-							 xlrec->data.unmodifiedsize,
-							 xlrec->data.length);
+				if (xl_info & XLR_BKP_BLOCK(0))
+					appendStringInfo(buf, " (full page image)");
+				else
+					desc_recompress_leaf(buf, &xlrec->data);
 			}
 			break;
 		case XLOG_GIN_DELETE_PAGE:
diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h
index a7beed1..fd83886 100644
--- a/src/include/access/gin_private.h
+++ b/src/include/access/gin_private.h
@@ -406,7 +406,8 @@ typedef struct
 	 * whose split this insertion finishes. As BlockIdData[2] (beware of adding
 	 * fields before this that would make them not 16-bit aligned)
 	 *
-	 * 2. one of the following structs, depending on tree type.
+	 * 2. an ginxlogInsertEntry or ginxlogRecompressDataLeaf struct, depending
+	 * on tree type.
 	 *
 	 * NB: the below structs are only 16-bit aligned when appended to a
 	 * ginxlogInsert struct! Beware of adding fields to them that require
@@ -421,17 +422,36 @@ typedef struct
 	IndexTupleData tuple;	/* variable length */
 } ginxlogInsertEntry;
 
+
 typedef struct
 {
-	uint16		length;
-	uint16		unmodifiedsize;
+	uint16		nactions;
 
-	/* compressed segments, variable length */
-	char		newdata[1];
+	/* Variable number of 'actions' follow */
 } ginxlogRecompressDataLeaf;
 
 typedef struct
 {
+	uint8		segno;		/* segment this action applies to */
+	char		type;		/* action type (see below) */
+
+	/*
+	 * Action-specific data follows. For INSERT and REPLACE actions that is a
+	 * GinPostingList struct. For ADDITEMS, a uint16 for the number of items
+	 * added, followed by the items themselves as ItemPointers. DELETE actions
+	 * have no further data.
+	 */
+} ginxlogSegmentAction;
+
+/* Action types */
+#define GIN_SEGMENT_UNMODIFIED	0	/* no action (not used in WAL records) */
+#define GIN_SEGMENT_DELETE		1	/* a whole segment is removed */
+#define GIN_SEGMENT_INSERT		2	/* a whole segment is added */
+#define GIN_SEGMENT_REPLACE		3	/* a segment is replaced */
+#define GIN_SEGMENT_ADDITEMS	4	/* items are added to existing segment */
+
+typedef struct
+{
 	OffsetNumber offset;
 	PostingItem newitem;
 } ginxlogInsertDataInternal;
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to