I came up with the attached patch, to reduce the WAL volume of GIN
insertions. It become fairly large, but I guess that's not too
surprising as the old WAL-logging method was basically to dump the whole
page to WAL record. This is now a lot more fine-grained and smarter. I
separated constructing the WAL record from copying the changes back to
the disk page, which IMHO is a readability improvement even though it's
more code.
There are two parts to this patch:
* leafRepackItems has been rewritten. The previous coding basically
searched for the first modified item, and decoded and re-encoded
everything on the page that after that. Now it tries harder to avoid
re-encoding segments that are still reasonably sized (between 128 and
384 bytes, with the target for new segments being 256 bytes). This ought
to make random updates faster as a bonus, but I didn't performance test
that.
* Track more carefully which segments on the page have been modified.
The in-memory structure used to manipulate a page now keeps an action
code for each segment, indicating if the segment is completely new,
deleted, or replaced with new content, or if just some new items have
been added to it. These same actions are WAL-logged, and replayed in the
redo routine.
This brings the WAL volume back to the same ballpark as 9.3. Or better,
depending on the operation.
Fujii, Alexander, how does this look to you?
- Heikki
diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c
index 313d53c..56b456e 100644
--- a/src/backend/access/gin/gindatapage.c
+++ b/src/backend/access/gin/gindatapage.c
@@ -22,17 +22,17 @@
#include "utils/rel.h"
/*
- * Size of the posting lists stored on leaf pages, in bytes. The code can
- * deal with any size, but random access is more efficient when a number of
- * smaller lists are stored, rather than one big list.
- */
-#define GinPostingListSegmentMaxSize 256
-
-/*
- * Existing posting lists smaller than this are recompressed, when inserting
- * new items to page.
+ * Min, Max and Target size of posting lists stored on leaf pages, in bytes.
+ *
+ * The code can deal with any size, but random access is more efficient when
+ * a number of smaller lists are stored, rather than one big list. If a
+ * posting list would become larger than Max size as a result of insertions,
+ * it is split into two. If a posting list would be smaller than minimum
+ * size
*/
-#define GinPostingListSegmentMinSize 192
+#define GinPostingListSegmentMaxSize 384
+#define GinPostingListSegmentTargetSize 256
+#define GinPostingListSegmentMinSize 128
/*
* At least this many items fit in a GinPostingListSegmentMaxSize-bytes
@@ -55,12 +55,32 @@ typedef struct
dlist_node *lastleft; /* last segment on left page */
int lsize; /* total size on left page */
int rsize; /* total size on right page */
+
+ bool oldformat; /* page is in pre-9.4 on disk */
} disassembledLeaf;
typedef struct
{
dlist_node node; /* linked list pointers */
+ /*-------------
+ * 'action' indicates the status of this in-memory segment, compared to
+ * what's on disk. It is one of the GIN_SEGMENT_* action codes:
+ *
+ * UNMODIFIED no changes
+ * DELETED the segment is to be removed. 'seg' and 'items' are
+ * ignored
+ * INSERT this is a completely new segment
+ * REPLACE this replaces an existing segment with new content
+ * ADDITEMS like REPLACE, but we track in detail what items have
+ * been added to this segment, in 'modifieditems'
+ *-------------
+ */
+ char action;
+
+ ItemPointerData *modifieditems;
+ int nmodifieditems;
+
/*
* The following fields represent the items in this segment.
* If 'items' is not NULL, it contains a palloc'd array of the items
@@ -72,8 +92,6 @@ typedef struct
GinPostingList *seg;
ItemPointer items;
int nitems; /* # of items in 'items', if items != NULL */
-
- bool modified; /* is this segment on page already? */
} leafSegmentInfo;
static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems);
@@ -87,9 +105,9 @@ static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining);
static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems);
-static void dataPlaceToPageLeafRecompress(Buffer buf,
- disassembledLeaf *leaf,
- XLogRecData **prdata);
+static XLogRecData *constructLeafRecompressWALData(Buffer buf,
+ disassembledLeaf *leaf);
+static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf);
static void dataPlaceToPageLeafSplit(Buffer buf,
disassembledLeaf *leaf,
ItemPointerData lbound, ItemPointerData rbound,
@@ -563,15 +581,21 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
if (!needsplit)
{
/*
- * Great, all the items fit on a single page. Write the segments to
- * the page, and WAL-log appropriately.
+ * Great, all the items fit on a single page. Construct a WAL record
+ * describing the changes we made, and write the segments back to
+ * the page.
*
* Once we start modifying the page, there's no turning back. The
* caller is responsible for calling END_CRIT_SECTION() after writing
* the WAL record.
*/
+ MemoryContextSwitchTo(oldCxt);
+ if (RelationNeedsWAL(btree->index))
+ *prdata = constructLeafRecompressWALData(buf, leaf);
+ else
+ *prdata = NULL;
START_CRIT_SECTION();
- dataPlaceToPageLeafRecompress(buf, leaf, prdata);
+ dataPlaceToPageLeafRecompress(buf, leaf);
if (append)
elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)",
@@ -619,8 +643,6 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
break;
}
- /* don't consider segments moved to right as unmodified */
- lastleftinfo->modified = true;
leaf->lsize -= segsize;
leaf->rsize += segsize;
leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft);
@@ -716,14 +738,15 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
/* Removing an item never increases the size of the segment */
if (npacked != ncleaned)
elog(ERROR, "could not fit vacuumed posting list");
+ seginfo->action = GIN_SEGMENT_REPLACE;
}
else
{
seginfo->seg = NULL;
seginfo->items = NULL;
+ seginfo->action = GIN_SEGMENT_DELETE;
}
seginfo->nitems = ncleaned;
- seginfo->modified = true;
removedsomething = true;
}
@@ -748,10 +771,33 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
*/
if (removedsomething)
{
- XLogRecData *payloadrdata;
+ XLogRecData *payloadrdata = NULL;
+ bool modified;
+ /*
+ * Make sure we have a palloc'd copy of all segments, after the
+ * first segment that is modified. (dataPlaceToPageLeafRecompress
+ * requires this).
+ */
+ modified = false;
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
+ if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
+ modified = true;
+ if (modified && seginfo->action != GIN_SEGMENT_DELETE)
+ {
+ int segsize = SizeOfGinPostingList(seginfo->seg);
+ GinPostingList *tmp = (GinPostingList *) palloc(segsize);
+ memcpy(tmp, seginfo->seg, segsize);
+ seginfo->seg = tmp;
+ }
+ }
+
+ if (RelationNeedsWAL(indexrel))
+ payloadrdata = constructLeafRecompressWALData(buffer, leaf);
START_CRIT_SECTION();
- dataPlaceToPageLeafRecompress(buffer, leaf, &payloadrdata);
+ dataPlaceToPageLeafRecompress(buffer, leaf);
MarkBufferDirty(buffer);
@@ -778,96 +824,168 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
}
/*
+ * Construct a ginxlogRecompressDataLeaf record representing the changes
+ * in *leaf.
+ */
+static XLogRecData *
+constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf)
+{
+ int nmodified = 0;
+ char *walbufbegin;
+ char *walbufend;
+ XLogRecData *rdata;
+ dlist_iter iter;
+ int segno;
+ ginxlogRecompressDataLeaf *recompress_xlog;
+
+ /* Count the modified segments */
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ iter.cur);
+
+ if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
+ nmodified++;
+ }
+
+ walbufbegin = palloc(
+ sizeof(ginxlogRecompressDataLeaf) +
+ BLCKSZ + /* max size needed to hold the segment data */
+ nmodified * 2 + /* (segno + action) per action */
+ sizeof(XLogRecData));
+ walbufend = walbufbegin;
+
+ recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend;
+ walbufend += sizeof(ginxlogRecompressDataLeaf);
+
+ recompress_xlog->nactions = nmodified;
+
+ segno = 0;
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
+ int segsize = 0;
+ int datalen;
+ uint8 action = seginfo->action;
+
+ if (action == GIN_SEGMENT_UNMODIFIED)
+ {
+ segno++;
+ continue;
+ }
+
+ if (action != GIN_SEGMENT_DELETE)
+ segsize = SizeOfGinPostingList(seginfo->seg);
+
+ /*
+ * If storing the uncompressed list of added item pointers would take
+ * more space than storing the compressed segment as is, do that
+ * instead.
+ */
+ if (action == GIN_SEGMENT_ADDITEMS &&
+ seginfo->nmodifieditems * sizeof(ItemPointerData) > segsize)
+ {
+ action = GIN_SEGMENT_REPLACE;
+ }
+
+ *((uint8 *) (walbufend++)) = segno;
+ *(walbufend++) = action;
+
+ switch (action)
+ {
+ case GIN_SEGMENT_DELETE:
+ datalen = 0;
+ break;
+
+ case GIN_SEGMENT_ADDITEMS:
+ datalen = seginfo->nmodifieditems * sizeof(ItemPointerData);
+ *((uint16 *) walbufend) = seginfo->nmodifieditems;
+ memcpy(walbufend + sizeof(uint16), seginfo->modifieditems, datalen);
+ datalen += sizeof(uint16);
+ break;
+
+ case GIN_SEGMENT_INSERT:
+ case GIN_SEGMENT_REPLACE:
+ datalen = SHORTALIGN(segsize);
+ memcpy(walbufend, seginfo->seg, segsize);
+ break;
+
+ default:
+ elog(ERROR, "unexpected GIN leaf action %d", action);
+ }
+ walbufend += datalen;
+
+ if (action != GIN_SEGMENT_INSERT)
+ segno++;
+ }
+
+ rdata = (XLogRecData *) MAXALIGN(walbufend);
+ rdata->buffer = buf;
+ rdata->buffer_std = TRUE;
+ rdata->data = walbufbegin;
+ rdata->len = walbufend - walbufbegin;
+ rdata->next = NULL;
+
+ return rdata;
+}
+
+/*
* Assemble a disassembled posting tree leaf page back to a buffer.
*
* *prdata is filled with WAL information about this operation. The caller
* is responsible for inserting to the WAL, along with any other information
* about the operation that triggered this recompression.
*
- * NOTE: The segment pointers can point directly to the same buffer, with
- * the limitation that any earlier segment must not overlap with an original,
- * later segment. In other words, some segments may point the original buffer
- * as long as you don't make any segments larger. Currently, leafRepackItems
- * satisies this rule because it rewrites all segments after the first
- * modified one, and vacuum can only make segments shorter.
+ * NOTE: The segment pointers must not point directly to the same buffer,
+ * except for segments that have not been modified, and none of the preceding
+ * segments have been modified either.
*/
static void
-dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf,
- XLogRecData **prdata)
+dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf)
{
Page page = BufferGetPage(buf);
char *ptr;
int newsize;
- int unmodifiedsize;
bool modified = false;
dlist_iter iter;
+ int segsize;
+
/*
- * these must be static so they can be returned to caller (no pallocs
- * since we're in a critical section!)
+ * If the page was in pre-9.4 format before, convert the header, and
+ * force all segments to be copied to the page whether they were modified
+ * or not.
*/
- static ginxlogRecompressDataLeaf recompress_xlog;
- static XLogRecData rdata[2];
+ if (!GinPageIsCompressed(page))
+ {
+ Assert(leaf->oldformat);
+ GinPageSetCompressed(page);
+ GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
+ modified = true;
+ }
ptr = (char *) GinDataLeafPageGetPostingList(page);
newsize = 0;
- unmodifiedsize = 0;
- modified = false;
dlist_foreach(iter, &leaf->segments)
{
leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
- int segsize;
- if (seginfo->modified)
+ if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
modified = true;
- /*
- * Nothing to do with empty segments, except keep track if they've been
- * modified
- */
- if (seginfo->seg == NULL)
- {
- Assert(seginfo->items == NULL);
- continue;
- }
+ if (seginfo->action != GIN_SEGMENT_DELETE)
+ segsize = SizeOfGinPostingList(seginfo->seg);
+ else
+ segsize = 0;
- segsize = SizeOfGinPostingList(seginfo->seg);
+ if (modified)
+ memcpy(ptr, seginfo->seg, segsize);
- if (!modified)
- unmodifiedsize += segsize;
- else
- {
- /*
- * Use memmove rather than memcpy, in case the segment points
- * to the same buffer
- */
- memmove(ptr, seginfo->seg, segsize);
- }
ptr += segsize;
newsize += segsize;
}
+
Assert(newsize <= GinDataLeafMaxContentSize);
GinDataLeafPageSetPostingListSize(page, newsize);
-
- /* Reset these in case the page was in pre-9.4 format before */
- GinPageSetCompressed(page);
- GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
-
- /* Put WAL data */
- recompress_xlog.length = (uint16) newsize;
- recompress_xlog.unmodifiedsize = unmodifiedsize;
-
- rdata[0].buffer = InvalidBuffer;
- rdata[0].data = (char *) &recompress_xlog;
- rdata[0].len = offsetof(ginxlogRecompressDataLeaf, newdata);
- rdata[0].next = &rdata[1];
-
- rdata[1].buffer = buf;
- rdata[1].buffer_std = TRUE;
- rdata[1].data = ((char *) GinDataLeafPageGetPostingList(page)) + unmodifiedsize;
- rdata[1].len = newsize - unmodifiedsize;
- rdata[1].next = NULL;
-
- *prdata = rdata;
}
/*
@@ -914,9 +1032,12 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf,
seginfo = dlist_container(leafSegmentInfo, node, node);
segsize = SizeOfGinPostingList(seginfo->seg);
- memcpy(ptr, seginfo->seg, segsize);
- ptr += segsize;
- lsize += segsize;
+ if (seginfo->action != GIN_SEGMENT_DELETE)
+ {
+ memcpy(ptr, seginfo->seg, segsize);
+ ptr += segsize;
+ lsize += segsize;
+ }
}
Assert(lsize == leaf->lsize);
GinDataLeafPageSetPostingListSize(lpage, lsize);
@@ -932,9 +1053,12 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf,
seginfo = dlist_container(leafSegmentInfo, node, node);
segsize = SizeOfGinPostingList(seginfo->seg);
- memcpy(ptr, seginfo->seg, segsize);
- ptr += segsize;
- rsize += segsize;
+ if (seginfo->action != GIN_SEGMENT_DELETE)
+ {
+ memcpy(ptr, seginfo->seg, segsize);
+ ptr += segsize;
+ rsize += segsize;
+ }
if (!dlist_has_next(&leaf->segments, node))
break;
@@ -1195,14 +1319,15 @@ disassembleLeaf(Page page)
{
leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo));
+ seginfo->action = GIN_SEGMENT_UNMODIFIED;
seginfo->seg = seg;
seginfo->items = NULL;
seginfo->nitems = 0;
- seginfo->modified = false;
dlist_push_tail(&leaf->segments, &seginfo->node);
seg = GinNextPostingListSegment(seg);
}
+ leaf->oldformat = false;
}
else
{
@@ -1218,14 +1343,15 @@ disassembleLeaf(Page page)
seginfo = palloc(sizeof(leafSegmentInfo));
+ seginfo->action = GIN_SEGMENT_REPLACE;
seginfo->seg = NULL;
seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData));
memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData));
seginfo->nitems = nuncompressed;
- /* make sure we rewrite this to disk */
- seginfo->modified = true;
dlist_push_tail(&leaf->segments, &seginfo->node);
+
+ leaf->oldformat = true;
}
return leaf;
@@ -1247,6 +1373,7 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
ItemPointer nextnew = newItems;
int newleft = nNewItems;
bool modified = false;
+ leafSegmentInfo *newseg;
/*
* If the page is completely empty, just construct one new segment to
@@ -1254,14 +1381,12 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
*/
if (dlist_is_empty(&leaf->segments))
{
- /* create a new segment for the new entries */
- leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo));
-
- seginfo->seg = NULL;
- seginfo->items = newItems;
- seginfo->nitems = nNewItems;
- seginfo->modified = true;
- dlist_push_tail(&leaf->segments, &seginfo->node);
+ newseg = palloc(sizeof(leafSegmentInfo));
+ newseg->seg = NULL;
+ newseg->items = newItems;
+ newseg->nitems = nNewItems;
+ newseg->action = GIN_SEGMENT_INSERT;
+ dlist_push_tail(&leaf->segments, &newseg->node);
return true;
}
@@ -1303,15 +1428,51 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
if (!cur->items)
cur->items = ginPostingListDecode(cur->seg, &cur->nitems);
+ /*
+ * Fast path for the important special case that we're appending to
+ * the end of the page: don't let the last segment on the page grow
+ * larger than the target, create a new segment before that happens.
+ */
+ if (!dlist_has_next(&leaf->segments, iter.cur) &&
+ ginCompareItemPointers(&cur->items[cur->nitems - 1], &nextnew[0]) > 0 &&
+ cur->seg != NULL &&
+ SizeOfGinPostingList(cur->seg) >= GinPostingListSegmentTargetSize)
+ {
+ newseg = palloc(sizeof(leafSegmentInfo));
+ newseg->seg = NULL;
+ newseg->items = nextnew;
+ newseg->nitems = nthis;
+ newseg->action = GIN_SEGMENT_INSERT;
+ dlist_push_tail(&leaf->segments, &newseg->node);
+ modified = true;
+ break;
+ }
+
tmpitems = ginMergeItemPointers(cur->items, cur->nitems,
nextnew, nthis,
&ntmpitems);
if (ntmpitems != cur->nitems)
{
+ /*
+ * If there are no duplicates, track the added items so that we
+ * can emit a compact ADDITEMS WAL record later on. (it doesn't
+ * seem worth re-checking which items were duplicates, if there
+ * were any)
+ */
+ if (ntmpitems == nthis + cur->nitems &&
+ cur->action == GIN_SEGMENT_UNMODIFIED)
+ {
+ cur->action = GIN_SEGMENT_ADDITEMS;
+ cur->modifieditems = nextnew;
+ cur->nmodifieditems = nthis;
+ }
+ else
+ cur->action = GIN_SEGMENT_REPLACE;
+
cur->items = tmpitems;
cur->nitems = ntmpitems;
cur->seg = NULL;
- modified = cur->modified = true;
+ modified = true;
}
nextnew += nthis;
@@ -1331,133 +1492,160 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
* If all items fit, *remaining is set to invalid.
*
* Returns true if the page has to be split.
- *
- * XXX: Actually, this re-encodes all segments after the first one that was
- * modified, to make sure the new segments are all more or less of equal
- * length. That's unnecessarily aggressive; if we've only added a single item
- * to one segment, for example, we could re-encode just that single segment,
- * as long as it's still smaller than, say, 2x the normal segment size.
*/
static bool
leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining)
{
- dlist_iter iter;
- ItemPointer allitems;
- int nallitems;
int pgused = 0;
- bool needsplit;
- int totalpacked;
- dlist_mutable_iter miter;
- dlist_head recode_list;
- int nrecode;
- bool recoding;
+ bool needsplit = false;
+ dlist_iter iter;
+ int segsize;
+ leafSegmentInfo *nextseg;
+ int npacked;
+ bool modified;
+ dlist_node *cur_node;
+ dlist_node *next_node;
ItemPointerSetInvalid(remaining);
/*
- * Find the first segment that needs to be re-coded. Move all segments
- * that need recoding to separate list, and count the total number of
- * items in them. Also, add up the number of bytes in unmodified segments
- * (pgused).
+ * cannot use dlist_foreach_modify here because we insert adjacent
+ * items while iterating.
*/
- dlist_init(&recode_list);
- recoding = false;
- nrecode = 0;
- pgused = 0;
- dlist_foreach_modify(miter, &leaf->segments)
+ for (cur_node = dlist_head_node(&leaf->segments);
+ cur_node != NULL;
+ cur_node = next_node)
{
- leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, miter.cur);
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, cur_node);
- /* If the segment was modified, re-encode it */
- if (seginfo->modified || seginfo->seg == NULL)
- recoding = true;
- /*
- * Also re-encode abnormally small or large segments. (Vacuum can
- * leave behind small segments, and conversion from pre-9.4 format
- * can leave behind large segments).
- */
- else if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize)
- recoding = true;
- else if (SizeOfGinPostingList(seginfo->seg) > GinPostingListSegmentMaxSize)
- recoding = true;
+ if (dlist_has_next(&leaf->segments, cur_node))
+ next_node = dlist_next_node(&leaf->segments, cur_node);
+ else
+ next_node = NULL;
- if (recoding)
+ /* Compress the posting list, if necessary */
+ if (seginfo->action != GIN_SEGMENT_DELETE)
{
- if (!seginfo->items)
- seginfo->items = ginPostingListDecode(seginfo->seg,
- &seginfo->nitems);
- nrecode += seginfo->nitems;
-
- dlist_delete(miter.cur);
- dlist_push_tail(&recode_list, &seginfo->node);
- }
- else
- pgused += SizeOfGinPostingList(seginfo->seg);
- }
+ if (seginfo->seg == NULL)
+ {
+ if (seginfo->nitems > GinPostingListSegmentMaxSize)
+ npacked = 0; /* no chance that it would fit. */
+ else
+ {
+ seginfo->seg = ginCompressPostingList(seginfo->items,
+ seginfo->nitems,
+ GinPostingListSegmentMaxSize,
+ &npacked);
+ }
+ if (npacked != seginfo->nitems)
+ {
+ /*
+ * Too large. Compress again to the target size, and create
+ * a new segment to represent the remaining items. The new
+ * segment is inserted after this one, so it will be
+ * processed in the next iteration of this loop.
+ */
+ if (seginfo->seg)
+ pfree(seginfo->seg);
+ seginfo->seg = ginCompressPostingList(seginfo->items,
+ seginfo->nitems,
+ GinPostingListSegmentTargetSize,
+ &npacked);
+ if (seginfo->action != GIN_SEGMENT_INSERT)
+ seginfo->action = GIN_SEGMENT_REPLACE;
+
+ nextseg = palloc(sizeof(leafSegmentInfo));
+ nextseg->action = GIN_SEGMENT_INSERT;
+ nextseg->seg = NULL;
+ nextseg->items = &seginfo->items[npacked];
+ nextseg->nitems = seginfo->nitems - npacked;
+ next_node = &nextseg->node;
+ dlist_insert_after(cur_node, next_node);
+ }
+ }
- if (nrecode == 0)
- return false; /* nothing to do */
+ /*
+ * If the segment is very small, merge it with the next
+ * segment.
+ */
+ if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize && next_node)
+ {
+ int nmerged;
+
+ nextseg = dlist_container(leafSegmentInfo, node, next_node);
+
+ if (seginfo->items == NULL)
+ seginfo->items = ginPostingListDecode(seginfo->seg,
+ &seginfo->nitems);
+ if (nextseg->items == NULL)
+ nextseg->items = ginPostingListDecode(nextseg->seg,
+ &nextseg->nitems);
+ nextseg->items =
+ ginMergeItemPointers(seginfo->items, seginfo->nitems,
+ nextseg->items, nextseg->nitems,
+ &nmerged);
+ Assert(nmerged == seginfo->nitems + nextseg->nitems);
+ nextseg->nitems = nmerged;
+ nextseg->seg = NULL;
+
+ nextseg->action = GIN_SEGMENT_REPLACE;
+ nextseg->modifieditems = NULL;
+ nextseg->nmodifieditems = 0;
+
+ if (seginfo->action == GIN_SEGMENT_INSERT)
+ {
+ dlist_delete(cur_node);
+ continue;
+ }
+ else
+ {
+ seginfo->action = GIN_SEGMENT_DELETE;
+ seginfo->seg = NULL;
+ }
+ }
- /*
- * Construct one big array of the items that need to be re-encoded.
- */
- allitems = palloc(nrecode * sizeof(ItemPointerData));
- nallitems = 0;
- dlist_foreach(iter, &recode_list)
- {
- leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
- memcpy(&allitems[nallitems], seginfo->items, seginfo->nitems * sizeof(ItemPointerData));
- nallitems += seginfo->nitems;
- }
- Assert(nallitems == nrecode);
+ seginfo->items = NULL;
+ seginfo->nitems = 0;
+ }
- /*
- * Start packing the items into segments. Stop when we have consumed
- * enough space to fill both pages, or we run out of items.
- */
- totalpacked = 0;
- needsplit = false;
- while (totalpacked < nallitems)
- {
- leafSegmentInfo *seginfo;
- int npacked;
- GinPostingList *seg;
- int segsize;
+ if (seginfo->action == GIN_SEGMENT_DELETE)
+ continue;
- seg = ginCompressPostingList(&allitems[totalpacked],
- nallitems - totalpacked,
- GinPostingListSegmentMaxSize,
- &npacked);
- segsize = SizeOfGinPostingList(seg);
+ /*
+ * OK, we now have a compressed version of this segment ready for
+ * copying to the page. Did we exceed the size that fits on one page?
+ */
+ segsize = SizeOfGinPostingList(seginfo->seg);
if (pgused + segsize > GinDataLeafMaxContentSize)
{
if (!needsplit)
{
/* switch to right page */
Assert(pgused > 0);
- leaf->lastleft = dlist_tail_node(&leaf->segments);
+ leaf->lastleft = dlist_prev_node(&leaf->segments, cur_node);
needsplit = true;
leaf->lsize = pgused;
pgused = 0;
}
else
{
- /* filled both pages */
- *remaining = allitems[totalpacked];
+ /*
+ * Filled both pages. The last segment we constructed did not
+ * fit.
+ */
+ *remaining = seginfo->seg->first;
+
+ /*
+ * remove any remaining segments that did not fit from the list.
+ */
+ while (dlist_has_next(&leaf->segments, cur_node))
+ dlist_delete(dlist_next_node(&leaf->segments, cur_node));
+ dlist_delete(cur_node);
break;
}
}
- seginfo = palloc(sizeof(leafSegmentInfo));
- seginfo->seg = seg;
- seginfo->items = &allitems[totalpacked];
- seginfo->nitems = npacked;
- seginfo->modified = true;
-
- dlist_push_tail(&leaf->segments, &seginfo->node);
-
pgused += segsize;
- totalpacked += npacked;
}
if (!needsplit)
@@ -1471,6 +1659,31 @@ leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining)
Assert(leaf->lsize <= GinDataLeafMaxContentSize);
Assert(leaf->rsize <= GinDataLeafMaxContentSize);
+ modified = false;
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
+
+ if (!modified && seginfo->action != GIN_SEGMENT_UNMODIFIED)
+ {
+ modified = true;
+ }
+ /*
+ * Need to make a palloc'd copy of every segment after the first
+ * modified one, because as we start copying items to the original
+ * page, we might overwrite an existing segment.
+ */
+ else if (modified && seginfo->action == GIN_SEGMENT_UNMODIFIED)
+ {
+ GinPostingList *tmp;
+
+ segsize = SizeOfGinPostingList(seginfo->seg);
+ tmp = palloc(segsize);
+ memcpy(tmp, seginfo->seg, segsize);
+ seginfo->seg = tmp;
+ }
+ }
+
return needsplit;
}
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c
index c46f7ac..5681e14 100644
--- a/src/backend/access/gin/ginxlog.c
+++ b/src/backend/access/gin/ginxlog.c
@@ -145,15 +145,159 @@ ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rda
static void
ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data)
{
- Pointer segment;
-
- /* Copy the new data to the right place */
- segment = ((Pointer) GinDataLeafPageGetPostingList(page))
- + data->unmodifiedsize;
- memcpy(segment, data->newdata, data->length - data->unmodifiedsize);
- GinDataLeafPageSetPostingListSize(page, data->length);
- GinPageSetCompressed(page);
- GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
+ int nactions = data->nactions;
+ int actionno;
+ int segno;
+ GinPostingList *oldseg;
+ Pointer segmentend;
+ char *walbuf;
+ int totalsize;
+
+ /*
+ * If the page is in pre-9.4 format, convert to new format first.
+ */
+ if (!GinPageIsCompressed(page))
+ {
+ ItemPointer uncompressed = (ItemPointer) GinDataPageGetData(page);
+ int nuncompressed = GinPageGetOpaque(page)->maxoff;
+ int npacked;
+ GinPostingList *plist;
+
+ plist = ginCompressPostingList(uncompressed, nuncompressed,
+ BLCKSZ, &npacked);
+ Assert(npacked == nuncompressed);
+
+ totalsize = SizeOfGinPostingList(plist);
+
+ memcpy(GinDataLeafPageGetPostingList(page), plist, totalsize);
+ GinDataLeafPageSetPostingListSize(page, totalsize);
+ GinPageSetCompressed(page);
+ GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
+ }
+
+ oldseg = GinDataLeafPageGetPostingList(page);
+ segmentend = (Pointer) oldseg + GinDataLeafPageGetPostingListSize(page);
+ segno = 0;
+
+ walbuf = ((char *) data) + sizeof(ginxlogRecompressDataLeaf);
+ for (actionno = 0; actionno < nactions; actionno++)
+ {
+ uint8 a_segno = *((uint8 *) (walbuf++));
+ uint8 a_action = *((uint8 *) (walbuf++));
+ GinPostingList *newseg = NULL;
+ int newsegsize = 0;
+ ItemPointerData *items = NULL;
+ uint16 nitems = 0;
+ ItemPointerData *olditems;
+ int nolditems;
+ ItemPointerData *newitems;
+ int nnewitems;
+ int segsize;
+ Pointer segptr;
+ int szleft;
+
+ /* Extract all the information we need from the WAL record */
+ if (a_action == GIN_SEGMENT_INSERT ||
+ a_action == GIN_SEGMENT_REPLACE)
+ {
+ newseg = (GinPostingList *) walbuf;
+ newsegsize = SizeOfGinPostingList(newseg);
+ walbuf += SHORTALIGN(newsegsize);
+ }
+
+ if (a_action == GIN_SEGMENT_ADDITEMS)
+ {
+ memcpy(&nitems, walbuf, sizeof(uint16));
+ walbuf += sizeof(uint16);
+ items = (ItemPointerData *) walbuf;
+ walbuf += nitems * sizeof(ItemPointerData);
+ }
+
+ /* Skip to the segment that this action concerns */
+ Assert(segno <= a_segno);
+ while (segno < a_segno)
+ {
+ oldseg = GinNextPostingListSegment(oldseg);
+ segno++;
+ }
+
+ /*
+ * ADDITEMS action is handled like REPLACE, but the new segment to
+ * replace the old one is reconstructed using the old segment from
+ * disk and the new items from the WAL record.
+ */
+ if (a_action == GIN_SEGMENT_ADDITEMS)
+ {
+ int npacked;
+
+ olditems = ginPostingListDecode(oldseg, &nolditems);
+
+ newitems = ginMergeItemPointers(items, nitems,
+ olditems, nolditems,
+ &nnewitems);
+ Assert(nnewitems == nolditems + nitems);
+
+ newseg = ginCompressPostingList(newitems, nnewitems,
+ BLCKSZ, &npacked);
+ Assert(npacked == nnewitems);
+
+ newsegsize = SizeOfGinPostingList(newseg);
+ a_action = GIN_SEGMENT_REPLACE;
+ }
+
+ segptr = (Pointer) oldseg;
+ if (segptr != segmentend)
+ segsize = SizeOfGinPostingList(oldseg);
+ else
+ {
+ /*
+ * Positioned after the last existing segment. Only INSERTs
+ * expected here.
+ */
+ Assert(a_action == GIN_SEGMENT_INSERT);
+ segsize = 0;
+ }
+ szleft = segmentend - segptr;
+
+ switch (a_action)
+ {
+ case GIN_SEGMENT_DELETE:
+ memmove(segptr, segptr + segsize, szleft - segsize);
+ segmentend -= segsize;
+
+ segno++;
+ break;
+
+ case GIN_SEGMENT_INSERT:
+ /* make room for the new segment */
+ memmove(segptr + newsegsize, segptr, szleft);
+ /* copy the new segment in place */
+ memcpy(segptr, newseg, newsegsize);
+ segmentend += newsegsize;
+ segptr += newsegsize;
+ break;
+
+ case GIN_SEGMENT_REPLACE:
+ /* shift the segments that follow */
+ memmove(segptr + newsegsize,
+ segptr + segsize,
+ szleft - segsize);
+ /* copy the replacement segment in place */
+ memcpy(segptr, newseg, newsegsize);
+ segmentend -= segsize;
+ segmentend += newsegsize;
+ segptr += newsegsize;
+ segno++;
+ break;
+
+ default:
+ elog(PANIC, "unexpected GIN leaf action: %u", a_action);
+ }
+ oldseg = (GinPostingList *) segptr;
+ }
+
+ totalsize = segmentend - (Pointer) GinDataLeafPageGetPostingList(page);
+ GinDataLeafPageSetPostingListSize(page, totalsize);
}
static void
diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c
index 1983bcc..aa60c8d 100644
--- a/src/backend/access/rmgrdesc/gindesc.c
+++ b/src/backend/access/rmgrdesc/gindesc.c
@@ -25,6 +25,57 @@ desc_node(StringInfo buf, RelFileNode node, BlockNumber blkno)
node.spcNode, node.dbNode, node.relNode, blkno);
}
+static void
+desc_recompress_leaf(StringInfo buf, ginxlogRecompressDataLeaf *insertData)
+{
+ int i;
+ char *walbuf = ((char *) insertData) + sizeof(ginxlogRecompressDataLeaf);
+
+ appendStringInfo(buf, " %d segments:", (int) insertData->nactions);
+
+ for (i = 0; i < insertData->nactions; i++)
+ {
+ uint8 a_segno = *((uint8 *) (walbuf++));
+ uint8 a_action = *((uint8 *) (walbuf++));
+ uint16 nitems = 0;
+ int newsegsize = 0;
+
+ if (a_action == GIN_SEGMENT_INSERT ||
+ a_action == GIN_SEGMENT_REPLACE)
+ {
+ newsegsize = SizeOfGinPostingList((GinPostingList *) walbuf);
+ walbuf += SHORTALIGN(newsegsize);
+ }
+
+ if (a_action == GIN_SEGMENT_ADDITEMS)
+ {
+ memcpy(&nitems, walbuf, sizeof(uint16));
+ walbuf += sizeof(uint16);
+ walbuf += nitems * sizeof(ItemPointerData);
+ }
+
+ switch(a_action)
+ {
+ case GIN_SEGMENT_ADDITEMS:
+ appendStringInfo(buf, " %d (add %d items)", a_segno, nitems);
+ break;
+ case GIN_SEGMENT_DELETE:
+ appendStringInfo(buf, " %d (delete)", a_segno);
+ break;
+ case GIN_SEGMENT_INSERT:
+ appendStringInfo(buf, " %d (insert)", a_segno);
+ break;
+ case GIN_SEGMENT_REPLACE:
+ appendStringInfo(buf, " %d (replace)", a_segno);
+ break;
+ default:
+ appendStringInfo(buf, " %d unknown action %d ???", a_segno, a_action);
+ /* cannot decode unrecognized actions further */
+ return;
+ }
+ }
+}
+
void
gin_desc(StringInfo buf, uint8 xl_info, char *rec)
{
@@ -70,9 +121,10 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec)
ginxlogRecompressDataLeaf *insertData =
(ginxlogRecompressDataLeaf *) payload;
- appendStringInfo(buf, " unmodified: %u length: %u (compressed)",
- insertData->unmodifiedsize,
- insertData->length);
+ if (xl_info & XLR_BKP_BLOCK(0))
+ appendStringInfo(buf, " (full page image)");
+ else
+ desc_recompress_leaf(buf, insertData);
}
else
{
@@ -105,9 +157,10 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec)
ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) rec;
appendStringInfoString(buf, "Vacuum data leaf page, ");
desc_node(buf, xlrec->node, xlrec->blkno);
- appendStringInfo(buf, " unmodified: %u length: %u",
- xlrec->data.unmodifiedsize,
- xlrec->data.length);
+ if (xl_info & XLR_BKP_BLOCK(0))
+ appendStringInfo(buf, " (full page image)");
+ else
+ desc_recompress_leaf(buf, &xlrec->data);
}
break;
case XLOG_GIN_DELETE_PAGE:
diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h
index a7beed1..fd83886 100644
--- a/src/include/access/gin_private.h
+++ b/src/include/access/gin_private.h
@@ -406,7 +406,8 @@ typedef struct
* whose split this insertion finishes. As BlockIdData[2] (beware of adding
* fields before this that would make them not 16-bit aligned)
*
- * 2. one of the following structs, depending on tree type.
+ * 2. an ginxlogInsertEntry or ginxlogRecompressDataLeaf struct, depending
+ * on tree type.
*
* NB: the below structs are only 16-bit aligned when appended to a
* ginxlogInsert struct! Beware of adding fields to them that require
@@ -421,17 +422,36 @@ typedef struct
IndexTupleData tuple; /* variable length */
} ginxlogInsertEntry;
+
typedef struct
{
- uint16 length;
- uint16 unmodifiedsize;
+ uint16 nactions;
- /* compressed segments, variable length */
- char newdata[1];
+ /* Variable number of 'actions' follow */
} ginxlogRecompressDataLeaf;
typedef struct
{
+ uint8 segno; /* segment this action applies to */
+ char type; /* action type (see below) */
+
+ /*
+ * Action-specific data follows. For INSERT and REPLACE actions that is a
+ * GinPostingList struct. For ADDITEMS, a uint16 for the number of items
+ * added, followed by the items themselves as ItemPointers. DELETE actions
+ * have no further data.
+ */
+} ginxlogSegmentAction;
+
+/* Action types */
+#define GIN_SEGMENT_UNMODIFIED 0 /* no action (not used in WAL records) */
+#define GIN_SEGMENT_DELETE 1 /* a whole segment is removed */
+#define GIN_SEGMENT_INSERT 2 /* a whole segment is added */
+#define GIN_SEGMENT_REPLACE 3 /* a segment is replaced */
+#define GIN_SEGMENT_ADDITEMS 4 /* items are added to existing segment */
+
+typedef struct
+{
OffsetNumber offset;
PostingItem newitem;
} ginxlogInsertDataInternal;
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers