The new version of the patch is attached.
This version is even simpler than the previous one,
thanks to the recent btree design changes and all the feedback I received.
I consider it ready for review and testing.

[feature overview]
This patch implements the deduplication of btree non-pivot tuples on leaf pages
in a manner similar to GIN index "posting lists".

A non-pivot posting tuple has the following format:
t_tid | t_info | key values | posting_list[]

where the t_tid and t_info fields store metadata
about the tuple's posting list.
The posting list itself is an array of ItemPointerData.
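
For illustration only (not part of the patch): a minimal sketch of how a
reader can walk all heap TIDs of a leaf tuple, using the accessor macros
this patch adds to nbtree.h. The function name walk_leaf_tuple_tids is made
up for the example.

#include "postgres.h"

#include "access/nbtree.h"

/* Sketch only: visit every heap TID referenced by a btree leaf tuple. */
static void
walk_leaf_tuple_tids(IndexTuple itup)
{
	if (BTreeTupleIsPosting(itup))
	{
		int			i;
		int			nposting = BTreeTupleGetNPosting(itup);

		for (i = 0; i < nposting; i++)
		{
			ItemPointer tid = BTreeTupleGetPostingN(itup, i);

			/* ... use one heap TID from the posting list ... */
			(void) tid;
		}
	}
	else
	{
		/* ordinary non-pivot tuple: its single heap TID lives in t_tid */
		ItemPointer tid = &itup->t_tid;

		(void) tid;
	}
}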

Currently, compression is applied to all indexes except system indexes, unique
indexes, and indexes with included columns.

On insertion, compression is applied not to each individual tuple, but to the
whole page before it is split: if the target page is full, we try to compress it.
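
To make the control flow easier to review, here is a condensed sketch (not
part of the patch) of what _bt_compress_one_page() does. It reuses the
patch's names (BTCompressState, add_item_to_posting(), _bt_keep_natts_fast(),
insert_itupprev_to_page()), while WAL logging, copying the high key,
dead-item handling, the size checks and the "mostly compressed already"
heuristic are omitted. The wrapper name compress_page_sketch is made up, and
insert_itupprev_to_page() is local to nbtinsert.c, so this is a reading aid
rather than compilable code.

#include "postgres.h"

#include "access/nbtree.h"

/* Sketch only: the core loop of page-level compression. */
static void
compress_page_sketch(Relation rel, Page page, Page newpage,
					 BTCompressState *compressState)
{
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	int			natts = IndexRelationGetNumberOfAttributes(rel);
	OffsetNumber offnum;
	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

	for (offnum = P_FIRSTDATAKEY(opaque);
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		IndexTuple	itup = (IndexTuple) PageGetItem(page,
													PageGetItemId(page, offnum));

		if (compressState->itupprev != NULL)
		{
			if (_bt_keep_natts_fast(rel, compressState->itupprev, itup) > natts)
				/* all attributes equal: accumulate TIDs into the posting list */
				add_item_to_posting(compressState, itup);
			else
				/* key changed: flush the accumulated tuple (posting or plain) */
				insert_itupprev_to_page(newpage, compressState);
		}

		/* remember the current tuple for the next comparison */
		if (compressState->itupprev)
			pfree(compressState->itupprev);
		compressState->itupprev = CopyIndexTuple(itup);
	}

	/* flush the last pending tuple */
	insert_itupprev_to_page(newpage, compressState);
}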

[benchmark results]
idx ON tbl(c1);
index contains 10000000 integer values

i - number of distinct values in the index.
So i=1 means that all rows have the same key,
and i=10000000 means that all keys are different.

       i    old size (MB)    new size (MB)
       1              215               88
    1000              215               90
  100000              215               71
10000000              214              214

For more, see the attached diagram with test results.

[future work]
Many things can still be improved in this feature.
Personally, I'd prefer to keep this patch as small as possible
and work on further improvements after the basic part is committed.
That said, I understand that some of the items below may be considered
essential for this patch to be approved.

1. Implement a split of the posting tuples on a page split.
2. Implement microvacuum of posting tuples.
3. Add a flag to pg_index that allows enabling/disabling compression for a particular index.
4. Implement posting list compression.

--
Anastasia Lubennikova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 602f884..fce499b 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -20,6 +20,7 @@
 #include "access/tableam.h"
 #include "access/transam.h"
 #include "access/xloginsert.h"
+#include "catalog/catalog.h"
 #include "miscadmin.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
@@ -56,6 +57,8 @@ static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
 static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
 						 OffsetNumber itup_off);
 static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
+static bool insert_itupprev_to_page(Page page, BTCompressState *compressState);
+static void _bt_compress_one_page(Relation rel, Buffer buffer, Relation heapRel);
 
 /*
  *	_bt_doinsert() -- Handle insertion of a single index tuple in the tree.
@@ -759,6 +762,12 @@ _bt_findinsertloc(Relation rel,
 			_bt_vacuum_one_page(rel, insertstate->buf, heapRel);
 			insertstate->bounds_valid = false;
 		}
+
+		/*
+		 * If the target page is full, try to compress the page
+		 */
+		if (PageGetFreeSpace(page) < insertstate->itemsz)
+			_bt_compress_one_page(rel, insertstate->buf, heapRel);
 	}
 	else
 	{
@@ -806,6 +815,11 @@ _bt_findinsertloc(Relation rel,
 			}
 
 			/*
+			 * Before considering moving right, try to compress the page
+			 */
+			_bt_compress_one_page(rel, insertstate->buf, heapRel);
+
+			/*
 			 * Nope, so check conditions (b) and (c) enumerated above
 			 *
 			 * The earlier _bt_check_unique() call may well have established a
@@ -2286,3 +2300,232 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
 	 * the page.
 	 */
 }
+
+/*
+ * Add a new item (compressed or not) to the page while compressing it.
+ * If the insertion fails, return false.
+ * The caller should treat this as a compression failure
+ * and leave the page uncompressed.
+ */
+static bool
+insert_itupprev_to_page(Page page, BTCompressState *compressState)
+{
+	IndexTuple to_insert;
+	OffsetNumber offnum =  PageGetMaxOffsetNumber(page);
+
+	if (compressState->ntuples == 0)
+		to_insert = compressState->itupprev;
+	else
+	{
+		IndexTuple postingtuple;
+		/* form a tuple with a posting list */
+		postingtuple = BTreeFormPostingTuple(compressState->itupprev,
+											compressState->ipd,
+											compressState->ntuples);
+		to_insert = postingtuple;
+		pfree(compressState->ipd);
+	}
+
+	/* Add the new item into the page */
+	offnum = OffsetNumberNext(offnum);
+
+	elog(DEBUG4, "insert_itupprev_to_page. compressState->ntuples %d IndexTupleSize %zu free %zu",
+			 compressState->ntuples, IndexTupleSize(to_insert), PageGetFreeSpace(page));
+
+	if (PageAddItem(page, (Item) to_insert, IndexTupleSize(to_insert),
+					offnum, false, false) == InvalidOffsetNumber)
+	{
+		elog(DEBUG4, "insert_itupprev_to_page. failed");
+		/*
+		 * This may happen if the tuple is bigger than the free space;
+		 * fall back to the uncompressed page case.
+		 */
+		if (compressState->ntuples > 0)
+			pfree(to_insert);
+		return false;
+	}
+
+	if (compressState->ntuples > 0)
+		pfree(to_insert);
+	compressState->ntuples = 0;
+	return true;
+}
+
+/*
+ * Before splitting the page, try to compress items to free some space.
+ * If compression doesn't succeed, the buffer will contain the old state of
+ * the page.  This function should be called after LP_DEAD items have been
+ * removed by _bt_vacuum_one_page().
+ */
+static void
+_bt_compress_one_page(Relation rel, Buffer buffer, Relation heapRel)
+{
+	OffsetNumber offnum,
+				minoff,
+				maxoff;
+	Page		page = BufferGetPage(buffer);
+	Page		newpage;
+	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	bool use_compression = false;
+	BTCompressState *compressState = NULL;
+	int n_posting_on_page = 0;
+	int natts = IndexRelationGetNumberOfAttributes(rel);
+
+	/*
+	 * Don't use compression for indexes with INCLUDEd columns,
+	 * system indexes and unique indexes.
+	 */
+	use_compression = ((IndexRelationGetNumberOfKeyAttributes(rel) ==
+									  IndexRelationGetNumberOfAttributes(rel))
+									  && (!IsSystemRelation(rel))
+									  && (!rel->rd_index->indisunique));
+	if (!use_compression)
+		return;
+
+	/* init compress state needed to build posting tuples */
+	compressState = (BTCompressState *) palloc0(sizeof(BTCompressState));
+	compressState->ipd = NULL;
+	compressState->ntuples = 0;
+	compressState->itupprev = NULL;
+	compressState->maxitemsize = BTMaxItemSize(page);
+	compressState->maxpostingsize = 0;
+
+	/*
+	 * Scan over all items to see which ones can be compressed
+	 */
+	minoff = P_FIRSTDATAKEY(opaque);
+	maxoff = PageGetMaxOffsetNumber(page);
+
+	/*
+	 * Heuristic to avoid trying to compress a page
+	 * that already contains mostly compressed items.
+	 */
+	for (offnum = minoff;
+		 offnum <= maxoff;
+		 offnum = OffsetNumberNext(offnum))
+	{
+		ItemId	itemid = PageGetItemId(page, offnum);
+		IndexTuple item = (IndexTuple) PageGetItem(page, itemid);
+
+		if (BTreeTupleIsPosting(item))
+			n_posting_on_page++;
+	}
+	/*
+	 * If we have fewer than 10 uncompressed items on the full page,
+	 * it probably isn't worth compressing them.
+	 */
+	if (maxoff - n_posting_on_page < 10)
+		return;
+
+	newpage = PageGetTempPageCopySpecial(page);
+	elog(DEBUG4, "_bt_compress_one_page rel: %s,blkno: %u",
+		 RelationGetRelationName(rel), BufferGetBlockNumber(buffer));
+
+	/* Copy High Key if any */
+	if (!P_RIGHTMOST(opaque))
+	{
+		ItemId	itemid = PageGetItemId(page, P_HIKEY);
+		Size itemsz = ItemIdGetLength(itemid);
+		IndexTuple item = (IndexTuple) PageGetItem(page, itemid);
+
+		if (PageAddItem(newpage, (Item) item, itemsz, P_HIKEY,
+						false, false) == InvalidOffsetNumber)
+		{
+			/*
+			 * Should never happen.  Anyway, fall back gently to the
+			 * incompressible page scenario and just return from the function.
+			 */
+			elog(DEBUG4, "_bt_compress_one_page. failed to insert highkey to newpage");
+			return;
+		}
+	}
+
+	/* Iterate over the tuples on the page, trying to compress them into
+	 * posting lists before inserting them into the new page.
+	 */
+	for (offnum = minoff;
+		 offnum <= maxoff;
+		 offnum = OffsetNumberNext(offnum))
+	{
+		ItemId		itemId = PageGetItemId(page, offnum);
+		IndexTuple	itup = (IndexTuple) PageGetItem(page, itemId);
+
+		/*
+		 * We do not expect to meet any DEAD items, since this
+		 * function is called right after _bt_vacuum_one_page().
+		 * If for some reason we find a dead item, don't compress it,
+		 * so that an upcoming microvacuum or vacuum can clean it up.
+		 */
+		if (ItemIdIsDead(itemId))
+			continue;
+
+		if (compressState->itupprev != NULL)
+		{
+			int n_equal_atts = _bt_keep_natts_fast(rel,
+														compressState->itupprev, itup);
+			int itup_ntuples = BTreeTupleIsPosting(itup)?BTreeTupleGetNPosting(itup):1;
+
+			if (n_equal_atts > natts)
+			{
+				/* Tuples are equal. Create or update posting. */
+				if (compressState->maxitemsize >
+					MAXALIGN(((IndexTupleSize(compressState->itupprev)
+							   + (compressState->ntuples + itup_ntuples+1)*sizeof(ItemPointerData)))))
+					add_item_to_posting(compressState, itup);
+				else
+					/* If the posting list is too big, insert it on the page and continue. */
+					if (!insert_itupprev_to_page(newpage, compressState))
+					{
+						elog(DEBUG4, "_bt_compress_one_page. failed to insert posting");
+						return;
+					}
+			}
+			else
+			{
+				/*
+				 * Tuples are not equal.  Insert itupprev into the new page
+				 * and save the current tuple for the next iteration.
+				 */
+				if (!insert_itupprev_to_page(newpage, compressState))
+				{
+					elog(DEBUG4, "_bt_compress_one_page. failed to insert posting");
+					return;
+				}
+			}
+		}
+
+		/*
+		 * Copy the tuple into the temporary variable itupprev
+		 * to compare it with the following tuple
+		 * and possibly merge them into a posting tuple.
+		 */
+		if (compressState->itupprev)
+			pfree(compressState->itupprev);
+		compressState->itupprev = CopyIndexTuple(itup);
+
+		Assert(IndexTupleSize(compressState->itupprev) <=  compressState->maxitemsize);
+	}
+
+	/* Handle the last item.*/
+	if (!insert_itupprev_to_page(newpage, compressState))
+	{
+		elog(DEBUG4, "_bt_compress_one_page. failed to insert posting for last item");
+		return;
+	}
+
+	START_CRIT_SECTION();
+	PageRestoreTempPage(newpage, page);
+	MarkBufferDirty(buffer);
+
+	/* Log full page write */
+	if (RelationNeedsWAL(rel))
+	{
+		XLogRecPtr recptr;
+		recptr = log_newpage_buffer(buffer, true);
+		PageSetLSN(page, recptr);
+	}
+	END_CRIT_SECTION();
+
+	elog(DEBUG4, "_bt_compress_one_page. success");
+	return;
+}
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index de4d4ef..681077f 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -1024,14 +1024,54 @@ _bt_page_recyclable(Page page)
 void
 _bt_delitems_vacuum(Relation rel, Buffer buf,
 					OffsetNumber *itemnos, int nitems,
+					OffsetNumber *remainingoffset,
+					IndexTuple *remaining, int nremaining,
 					BlockNumber lastBlockVacuumed)
 {
 	Page		page = BufferGetPage(buf);
 	BTPageOpaque opaque;
+	int		i;
+	Size	itemsz;
+	Size	remaining_sz = 0;
+	char   *remaining_buf = NULL;
+
+	/* XLOG stuff: build a buffer holding the remaining tuples */
+	if (nremaining && RelationNeedsWAL(rel))
+	{
+		Size offset = 0;
+
+		for (i = 0; i < nremaining; i++)
+			remaining_sz += MAXALIGN(IndexTupleSize(remaining[i]));
+
+		remaining_buf = palloc0(remaining_sz);
+		for (i = 0; i < nremaining; i++)
+		{
+			itemsz = IndexTupleSize(remaining[i]);
+			memcpy(remaining_buf + offset, (char *) remaining[i], itemsz);
+			offset += MAXALIGN(itemsz);
+		}
+		Assert(offset == remaining_sz);
+	}
 
 	/* No ereport(ERROR) until changes are logged */
 	START_CRIT_SECTION();
 
+	/* Handle posting tuples here */
+	for (i = 0; i < nremaining; i++)
+	{
+		/* First, delete the old tuple. */
+		PageIndexTupleDelete(page, remainingoffset[i]);
+
+		itemsz = IndexTupleSize(remaining[i]);
+		itemsz = MAXALIGN(itemsz);
+
+		/* Add the tuple with the remaining ItemPointers to the page. */
+		if (PageAddItem(page, (Item) remaining[i], itemsz, remainingoffset[i],
+							false, false) == InvalidOffsetNumber)
+			elog(ERROR, "failed to rewrite compressed item in index while doing vacuum");
+	}
+
+
 	/* Fix the page */
 	if (nitems > 0)
 		PageIndexMultiDelete(page, itemnos, nitems);
@@ -1061,6 +1101,9 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
 		xl_btree_vacuum xlrec_vacuum;
 
 		xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
+		xlrec_vacuum.nremaining = nremaining;
+		xlrec_vacuum.ndeleted = nitems;
+
 
 		XLogBeginInsert();
 		XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
@@ -1074,6 +1117,20 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
 		if (nitems > 0)
 			XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber));
 
+		/*
+		 * Here we save the offsets and the remaining tuples themselves.
+		 * It's important to restore them in the correct order:
+		 * the remaining tuples must be handled first, and only after
+		 * that the other deleted items.
+		 */
+		if (nremaining > 0)
+		{
+			Assert(remaining_buf != NULL);
+			XLogRegisterBufData(0, (char *) remainingoffset,
+								nremaining * sizeof(OffsetNumber));
+			XLogRegisterBufData(0, remaining_buf, remaining_sz);
+		}
+
 		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
 
 		PageSetLSN(page, recptr);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 85e54ac..5a7d7bd 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -97,8 +97,8 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 						 BTCycleId cycleid, TransactionId *oldestBtpoXact);
 static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
 						 BlockNumber orig_blkno);
-
-
+static ItemPointer btreevacuumPosting(BTVacState *vstate,
+									  IndexTuple itup, int *nremaining);
 /*
  * Btree handler function: return IndexAmRoutine with access method parameters
  * and callbacks.
@@ -1069,7 +1069,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 								 RBM_NORMAL, info->strategy);
 		LockBufferForCleanup(buf);
 		_bt_checkpage(rel, buf);
-		_bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
+		_bt_delitems_vacuum(rel, buf, NULL, 0, NULL, NULL, 0, vstate.lastBlockVacuumed);
 		_bt_relbuf(rel, buf);
 	}
 
@@ -1193,6 +1193,9 @@ restart:
 		OffsetNumber offnum,
 					minoff,
 					maxoff;
+		IndexTuple	remaining[MaxOffsetNumber];
+		OffsetNumber remainingoffset[MaxOffsetNumber];
+		int			nremaining;
 
 		/*
 		 * Trade in the initial read lock for a super-exclusive write lock on
@@ -1229,6 +1232,7 @@ restart:
 		 * callback function.
 		 */
 		ndeletable = 0;
+		nremaining = 0;
 		minoff = P_FIRSTDATAKEY(opaque);
 		maxoff = PageGetMaxOffsetNumber(page);
 		if (callback)
@@ -1242,31 +1246,77 @@ restart:
 
 				itup = (IndexTuple) PageGetItem(page,
 												PageGetItemId(page, offnum));
-				htup = &(itup->t_tid);
 
-				/*
-				 * During Hot Standby we currently assume that
-				 * XLOG_BTREE_VACUUM records do not produce conflicts. That is
-				 * only true as long as the callback function depends only
-				 * upon whether the index tuple refers to heap tuples removed
-				 * in the initial heap scan. When vacuum starts it derives a
-				 * value of OldestXmin. Backends taking later snapshots could
-				 * have a RecentGlobalXmin with a later xid than the vacuum's
-				 * OldestXmin, so it is possible that row versions deleted
-				 * after OldestXmin could be marked as killed by other
-				 * backends. The callback function *could* look at the index
-				 * tuple state in isolation and decide to delete the index
-				 * tuple, though currently it does not. If it ever did, we
-				 * would need to reconsider whether XLOG_BTREE_VACUUM records
-				 * should cause conflicts. If they did cause conflicts they
-				 * would be fairly harsh conflicts, since we haven't yet
-				 * worked out a way to pass a useful value for
-				 * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
-				 * applies to *any* type of index that marks index tuples as
-				 * killed.
-				 */
-				if (callback(htup, callback_state))
-					deletable[ndeletable++] = offnum;
+				if (BTreeTupleIsPosting(itup))
+				{
+					int 		nnewipd = 0;
+					ItemPointer newipd = NULL;
+
+					newipd = btreevacuumPosting(vstate, itup, &nnewipd);
+
+					if (nnewipd == 0)
+					{
+						/*
+						 * All TIDs from the posting list must be deleted;
+						 * we can delete the whole tuple in the regular way.
+						 */
+						deletable[ndeletable++] = offnum;
+					}
+					else if (nnewipd == BTreeTupleGetNPosting(itup))
+					{
+						/*
+						 * All TIDs from the posting tuple must remain.
+						 * Do nothing, just clean up.
+						 */
+						pfree(newipd);
+					}
+					else if (nnewipd < BTreeTupleGetNPosting(itup))
+					{
+						/* Some TIDs from posting tuple must remain. */
+						Assert(nnewipd > 0);
+						Assert(newipd != NULL);
+
+						/*
+						 * Form a new tuple that contains only the remaining TIDs.
+						 * Remember this tuple and the offset of the old tuple
+						 * so that it can be updated in place.
+						 */
+						remainingoffset[nremaining] = offnum;
+						remaining[nremaining] = BTreeFormPostingTuple(itup, newipd, nnewipd);
+						nremaining++;
+						pfree(newipd);
+
+						Assert(IndexTupleSize(itup) <= BTMaxItemSize(page));
+					}
+				}
+				else
+				{
+					htup = &(itup->t_tid);
+
+					/*
+					 * During Hot Standby we currently assume that
+					 * XLOG_BTREE_VACUUM records do not produce conflicts. That is
+					 * only true as long as the callback function depends only
+					 * upon whether the index tuple refers to heap tuples removed
+					 * in the initial heap scan. When vacuum starts it derives a
+					 * value of OldestXmin. Backends taking later snapshots could
+					 * have a RecentGlobalXmin with a later xid than the vacuum's
+					 * OldestXmin, so it is possible that row versions deleted
+					 * after OldestXmin could be marked as killed by other
+					 * backends. The callback function *could* look at the index
+					 * tuple state in isolation and decide to delete the index
+					 * tuple, though currently it does not. If it ever did, we
+					 * would need to reconsider whether XLOG_BTREE_VACUUM records
+					 * should cause conflicts. If they did cause conflicts they
+					 * would be fairly harsh conflicts, since we haven't yet
+					 * worked out a way to pass a useful value for
+					 * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
+					 * applies to *any* type of index that marks index tuples as
+					 * killed.
+					 */
+					if (callback(htup, callback_state))
+						deletable[ndeletable++] = offnum;
+				}
 			}
 		}
 
@@ -1274,7 +1324,7 @@ restart:
 		 * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
 		 * call per page, so as to minimize WAL traffic.
 		 */
-		if (ndeletable > 0)
+		if (ndeletable > 0 || nremaining > 0)
 		{
 			/*
 			 * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
@@ -1291,6 +1341,7 @@ restart:
 			 * that.
 			 */
 			_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
+								remainingoffset, remaining, nremaining,
 								vstate->lastBlockVacuumed);
 
 			/*
@@ -1376,6 +1427,43 @@ restart:
 }
 
 /*
+ * btreevacuumPosting() -- vacuum a posting tuple.
+ *
+ * Returns a new palloc'd posting list containing the remaining items.
+ * The size of the posting list is returned via nremaining.
+ *
+ * If all items are dead, nremaining is set to 0
+ * and the resulting posting list is NULL.
+ */
+static ItemPointer
+btreevacuumPosting(BTVacState *vstate, IndexTuple itup, int *nremaining)
+{
+	int			i,
+				remaining	= 0;
+	int			nitem		= BTreeTupleGetNPosting(itup);
+	ItemPointer tmpitems	= NULL,
+				items		= BTreeTupleGetPosting(itup);
+
+	/*
+	 * Check each item in the posting list and
+	 * save the live ones into tmpitems.
+	 */
+	for (i = 0; i < nitem; i++)
+	{
+		if (vstate->callback(items + i, vstate->callback_state))
+			continue;
+
+		if (tmpitems == NULL)
+			tmpitems = palloc(sizeof(ItemPointerData) * nitem);
+
+		tmpitems[remaining++] = items[i];
+	}
+
+	*nremaining = remaining;
+	return tmpitems;
+}
+
+/*
  *	btcanreturn() -- Check whether btree indexes support index-only scans.
  *
  * btrees always do, so this is trivial.
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index c655dad..594936d 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -30,6 +30,8 @@ static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
 						 OffsetNumber offnum);
 static void _bt_saveitem(BTScanOpaque so, int itemIndex,
 						 OffsetNumber offnum, IndexTuple itup);
+static void _bt_savePostingitem(BTScanOpaque so, int itemIndex,
+			OffsetNumber offnum, ItemPointer iptr, IndexTuple itup, int i);
 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
 static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
 static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
@@ -1410,6 +1412,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	int			itemIndex;
 	bool		continuescan;
 	int			indnatts;
+	int 		i;
 
 	/*
 	 * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1456,6 +1459,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
 	/* initialize tuple workspace to empty */
 	so->currPos.nextTupleOffset = 0;
+	so->currPos.prevTupleOffset = 0;
 
 	/*
 	 * Now that the current page has been made consistent, the macro should be
@@ -1490,8 +1494,22 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
 			{
 				/* tuple passes all scan key conditions, so remember it */
-				_bt_saveitem(so, itemIndex, offnum, itup);
-				itemIndex++;
+				if (BTreeTupleIsPosting(itup))
+				{
+					for (i = 0; i < BTreeTupleGetNPosting(itup); i++)
+					{
+						_bt_savePostingitem(so, itemIndex, offnum,
+											BTreeTupleGetPostingN(itup, i),
+											itup, i);
+						itemIndex++;
+					}
+				}
+				else
+				{
+					_bt_saveitem(so, itemIndex, offnum, itup);
+					itemIndex++;
+				}
+
 			}
 			/* When !continuescan, there can't be any more matches, so stop */
 			if (!continuescan)
@@ -1524,7 +1542,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 		if (!continuescan)
 			so->currPos.moreRight = false;
 
-		Assert(itemIndex <= MaxIndexTuplesPerPage);
+		Assert(itemIndex <= MaxPostingIndexTuplesPerPage);
 		so->currPos.firstItem = 0;
 		so->currPos.lastItem = itemIndex - 1;
 		so->currPos.itemIndex = 0;
@@ -1532,7 +1550,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	else
 	{
 		/* load items[] in descending order */
-		itemIndex = MaxIndexTuplesPerPage;
+		itemIndex = MaxPostingIndexTuplesPerPage;
 
 		offnum = Min(offnum, maxoff);
 
@@ -1574,8 +1592,22 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			if (passes_quals && tuple_alive)
 			{
 				/* tuple passes all scan key conditions, so remember it */
-				itemIndex--;
-				_bt_saveitem(so, itemIndex, offnum, itup);
+				if (BTreeTupleIsPosting(itup))
+				{
+					for (i = 0; i < BTreeTupleGetNPosting(itup); i++)
+					{
+						itemIndex--;
+						_bt_savePostingitem(so, itemIndex, offnum,
+											BTreeTupleGetPostingN(itup, i),
+											itup, i);
+					}
+				}
+				else
+				{
+					itemIndex--;
+					_bt_saveitem(so, itemIndex, offnum, itup);
+				}
+
 			}
 			if (!continuescan)
 			{
@@ -1589,8 +1621,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
 		Assert(itemIndex >= 0);
 		so->currPos.firstItem = itemIndex;
-		so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
-		so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
+		so->currPos.lastItem = MaxPostingIndexTuplesPerPage - 1;
+		so->currPos.itemIndex = MaxPostingIndexTuplesPerPage - 1;
 	}
 
 	return (so->currPos.firstItem <= so->currPos.lastItem);
@@ -1603,6 +1635,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
 {
 	BTScanPosItem *currItem = &so->currPos.items[itemIndex];
 
+	Assert(!BTreeTupleIsPosting(itup));
+
 	currItem->heapTid = itup->t_tid;
 	currItem->indexOffset = offnum;
 	if (so->currTuples)
@@ -1615,6 +1649,34 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
 	}
 }
 
+/* Save an index item into so->currPos.items[itemIndex] for posting tuples. */
+static void
+_bt_savePostingitem(BTScanOpaque so, int itemIndex,
+			 OffsetNumber offnum, ItemPointer iptr, IndexTuple itup, int i)
+{
+	BTScanPosItem *currItem =  &so->currPos.items[itemIndex];
+
+	currItem->heapTid = *iptr;
+	currItem->indexOffset = offnum;
+
+	if (so->currTuples)
+	{
+		if (i == 0)
+		{
+			/* save the key; it is the same for all items in the posting list */
+			Size itupsz = BTreeTupleGetPostingOffset(itup);
+
+			currItem->tupleOffset = so->currPos.nextTupleOffset;
+			memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
+			so->currPos.nextTupleOffset += MAXALIGN(itupsz);
+			so->currPos.prevTupleOffset = currItem->tupleOffset;
+		}
+		else
+			currItem->tupleOffset = so->currPos.prevTupleOffset;
+	}
+}
+
+
 /*
  *	_bt_steppage() -- Step to next page containing valid data for scan
  *
@@ -2221,6 +2283,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 
 	/* OK, itemIndex says what to return */
 	currItem = &so->currPos.items[so->currPos.itemIndex];
+
 	scan->xs_heaptid = currItem->heapTid;
 	if (scan->xs_want_itup)
 		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index d0b9013..59f702b 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -65,6 +65,7 @@
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xloginsert.h"
+#include "catalog/catalog.h"
 #include "catalog/index.h"
 #include "commands/progress.h"
 #include "miscadmin.h"
@@ -76,6 +77,7 @@
 #include "utils/tuplesort.h"
 
 
+
 /* Magic numbers for parallel state sharing */
 #define PARALLEL_KEY_BTREE_SHARED		UINT64CONST(0xA000000000000001)
 #define PARALLEL_KEY_TUPLESORT			UINT64CONST(0xA000000000000002)
@@ -288,6 +290,8 @@ static void _bt_sortaddtup(Page page, Size itemsize,
 static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
 						 IndexTuple itup);
 static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
+static void insert_itupprev_to_page_buildadd(BTWriteState *wstate,
+						BTPageState *state, BTCompressState *compressState);
 static void _bt_load(BTWriteState *wstate,
 					 BTSpool *btspool, BTSpool *btspool2);
 static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
@@ -972,6 +976,11 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 			 * only shift the line pointer array back and forth, and overwrite
 			 * the tuple space previously occupied by oitup.  This is fairly
 			 * cheap.
+			 *
+			 * If the lastleft tuple was a posting tuple,
+			 * we'll truncate its posting list in _bt_truncate() as well.
+			 * Note that this applies only to leaf pages,
+			 * since internal pages never contain posting tuples.
 			 */
 			ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
 			lastleft = (IndexTuple) PageGetItem(opage, ii);
@@ -1011,6 +1020,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 		 * the minimum key for the new page.
 		 */
 		state->btps_minkey = CopyIndexTuple(oitup);
+		Assert(!BTreeTupleIsPosting(state->btps_minkey));
 
 		/*
 		 * Set the sibling links for both pages.
@@ -1050,8 +1060,35 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
 	if (last_off == P_HIKEY)
 	{
 		Assert(state->btps_minkey == NULL);
-		state->btps_minkey = CopyIndexTuple(itup);
-		/* _bt_sortaddtup() will perform full truncation later */
+
+		/*
+		 * The stashed copy must be a non-posting tuple,
+		 * with the posting list truncated and a correct t_tid,
+		 * since we're going to use it to build the downlink.
+		 */
+		if (BTreeTupleIsPosting(itup))
+		{
+			Size		keytupsz;
+			IndexTuple  keytup;
+
+			/*
+			 * Form a key tuple that doesn't contain any posting list.
+			 * NOTE: since we'll need the TID later, set t_tid to
+			 * the first TID from the posting list.
+			 */
+			keytupsz = BTreeTupleGetPostingOffset(itup);
+			keytup = palloc0(keytupsz);
+			memcpy(keytup, itup, keytupsz);
+
+			keytup->t_info &= ~INDEX_SIZE_MASK;
+			keytup->t_info |= keytupsz;
+			ItemPointerCopy(BTreeTupleGetPosting(itup), &keytup->t_tid);
+			state->btps_minkey = CopyIndexTuple(keytup);
+			pfree(keytup);
+		}
+		else
+			state->btps_minkey = CopyIndexTuple(itup);		/* _bt_sortaddtup() will perform full truncation later */
+
 		BTreeTupleSetNAtts(state->btps_minkey, 0);
 	}
 
@@ -1137,6 +1174,87 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
 }
 
 /*
+ * Add a new tuple (posting or non-posting) to the page while building the index.
+ */
+static void
+insert_itupprev_to_page_buildadd(BTWriteState *wstate, BTPageState *state,
+								 BTCompressState *compressState)
+{
+	IndexTuple to_insert;
+
+	/* Return if there is no page state, i.e. nothing to insert */
+	if (state == NULL)
+		return;
+
+	if (compressState->ntuples == 0)
+		to_insert = compressState->itupprev;
+	else
+	{
+		IndexTuple postingtuple;
+		/* form a tuple with a posting list */
+		postingtuple = BTreeFormPostingTuple(compressState->itupprev,
+											 compressState->ipd,
+											 compressState->ntuples);
+		to_insert = postingtuple;
+		pfree(compressState->ipd);
+	}
+
+	_bt_buildadd(wstate, state, to_insert);
+
+	if (compressState->ntuples > 0)
+		pfree(to_insert);
+	compressState->ntuples = 0;
+}
+
+/*
+ * Save the item pointer(s) of itup to the posting list in compressState.
+ * Helper function for _bt_load() and _bt_compress_one_page().
+ *
+ * Note: the caller is responsible for the size check that ensures the
+ * resulting tuple won't exceed BTMaxItemSize.
+ */
+void
+add_item_to_posting(BTCompressState *compressState, IndexTuple itup)
+{
+	int nposting = 0;
+
+	if (compressState->ntuples == 0)
+	{
+		compressState->ipd = palloc0(compressState->maxitemsize);
+
+		if (BTreeTupleIsPosting(compressState->itupprev))
+		{
+			/* if itupprev is posting, add all its TIDs to the posting list */
+			nposting = BTreeTupleGetNPosting(compressState->itupprev);
+			memcpy(compressState->ipd, BTreeTupleGetPosting(compressState->itupprev),
+				   sizeof(ItemPointerData)*nposting);
+			compressState->ntuples += nposting;
+		}
+		else
+		{
+			memcpy(compressState->ipd, compressState->itupprev,
+				   sizeof(ItemPointerData));
+			compressState->ntuples++;
+		}
+	}
+
+	if (BTreeTupleIsPosting(itup))
+	{
+		/* if tuple is posting, add all its TIDs to the posting list */
+		nposting = BTreeTupleGetNPosting(itup);
+		memcpy(compressState->ipd + compressState->ntuples,
+			   BTreeTupleGetPosting(itup), sizeof(ItemPointerData)*nposting);
+		compressState->ntuples += nposting;
+	}
+	else
+	{
+		memcpy(compressState->ipd + compressState->ntuples, itup,
+			   sizeof(ItemPointerData));
+		compressState->ntuples++;
+	}
+}
+
+/*
  * Read tuples in correct sort order from tuplesort, and load them into
  * btree leaves.
  */
@@ -1150,9 +1268,21 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	bool		load1;
 	TupleDesc	tupdes = RelationGetDescr(wstate->index);
 	int			i,
-				keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
+				keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index),
+				natts = IndexRelationGetNumberOfAttributes(wstate->index);
 	SortSupport sortKeys;
 	int64		tuples_done = 0;
+	bool		use_compression = false;
+	BTCompressState *compressState = NULL;
+
+	/*
+	 * Don't use compression for indexes with INCLUDEd columns,
+	 * system indexes and unique indexes.
+	 */
+	use_compression = ((IndexRelationGetNumberOfKeyAttributes(wstate->index) ==
+									  IndexRelationGetNumberOfAttributes(wstate->index))
+									  && (!IsSystemRelation(wstate->index))
+									  && (!wstate->index->rd_index->indisunique));
 
 	if (merge)
 	{
@@ -1266,19 +1396,83 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	}
 	else
 	{
-		/* merge is unnecessary */
-		while ((itup = tuplesort_getindextuple(btspool->sortstate,
-											   true)) != NULL)
+		if (!use_compression)
 		{
-			/* When we see first tuple, create first index page */
-			if (state == NULL)
-				state = _bt_pagestate(wstate, 0);
+			/* merge is unnecessary */
+			while ((itup = tuplesort_getindextuple(btspool->sortstate,
+												   true)) != NULL)
+			{
+				/* When we see first tuple, create first index page */
+				if (state == NULL)
+					state = _bt_pagestate(wstate, 0);
 
-			_bt_buildadd(wstate, state, itup);
+				_bt_buildadd(wstate, state, itup);
 
-			/* Report progress */
-			pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
-										 ++tuples_done);
+				/* Report progress */
+				pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
+											++tuples_done);
+			}
+		}
+		else
+		{
+			/* init compress state needed to build posting tuples */
+			compressState = (BTCompressState *) palloc0(sizeof(BTCompressState));
+			compressState->ipd = NULL;
+			compressState->ntuples = 0;
+			compressState->itupprev = NULL;
+			compressState->maxitemsize = 0;
+			compressState->maxpostingsize = 0;
+	
+			while ((itup = tuplesort_getindextuple(btspool->sortstate,
+												   true)) != NULL)
+			{
+				/* When we see first tuple, create first index page */
+				if (state == NULL)
+				{
+					state = _bt_pagestate(wstate, 0);
+					compressState->maxitemsize = BTMaxItemSize(state->btps_page);
+				}
+
+				if (compressState->itupprev != NULL)
+				{
+					int n_equal_atts = _bt_keep_natts_fast(wstate->index,
+														   compressState->itupprev, itup);
+
+					if (n_equal_atts > natts)
+					{
+						/* Tuples are equal. Create or update posting. */
+						if ((compressState->ntuples+1)*sizeof(ItemPointerData) < compressState->maxpostingsize)
+							add_item_to_posting(compressState, itup);
+						else
+							/* If the posting list is too big, insert it on the page and continue. */
+							insert_itupprev_to_page_buildadd(wstate, state, compressState);
+					}
+					else
+					{
+						/*
+						 * Tuples are not equal.  Insert itupprev into the index
+						 * and save the current tuple for the next iteration.
+						 */
+						insert_itupprev_to_page_buildadd(wstate, state, compressState);
+					}
+				}
+
+				/*
+				 * Save the tuple to compare it with the next one
+				 * and maybe unite them into a posting tuple.
+				 */
+				if (compressState->itupprev)
+					pfree(compressState->itupprev);
+				compressState->itupprev = CopyIndexTuple(itup);
+
+				/* compute the maximum size of the posting list */
+				compressState->maxpostingsize = compressState->maxitemsize -
+								IndexInfoFindDataOffset(compressState->itupprev->t_info) -
+								MAXALIGN(IndexTupleSize(compressState->itupprev));
+			}
+
+			/* Handle the last item */
+			insert_itupprev_to_page_buildadd(wstate, state, compressState);
 		}
 	}
 
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 93fab26..8b77b69 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1787,7 +1787,9 @@ _bt_killitems(IndexScanDesc scan)
 			ItemId		iid = PageGetItemId(page, offnum);
 			IndexTuple	ituple = (IndexTuple) PageGetItem(page, iid);
 
-			if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
+			/* No microvacuum for posting tuples */
+			if (!BTreeTupleIsPosting(ituple) &&
+				(ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)))
 			{
 				/* found the item */
 				ItemIdMarkDead(iid);
@@ -2145,6 +2147,16 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
 
 		pivot = index_truncate_tuple(itupdesc, firstright, keepnatts);
 
+		if (BTreeTupleIsPosting(firstright))
+		{
+			BTreeTupleClearBtIsPosting(pivot);
+			BTreeTupleSetNAtts(pivot, keepnatts);
+			pivot->t_info &= ~INDEX_SIZE_MASK;
+			pivot->t_info |= BTreeTupleGetPostingOffset(firstright);
+		}
+
+		Assert(!BTreeTupleIsPosting(pivot));
+
 		/*
 		 * If there is a distinguishing key attribute within new pivot tuple,
 		 * there is no need to add an explicit heap TID attribute
@@ -2168,6 +2180,26 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
 		pfree(pivot);
 		pivot = tidpivot;
 	}
+	else if (BTreeTupleIsPosting(firstright))
+	{
+		/*
+		 * No truncation was possible, since key attributes are all equal.
+		 * But the tuple is a compressed tuple with a posting list,
+		 * so we still must truncate it.
+		 * 
+		 * It's necessary to add a heap TID attribute to the new pivot tuple.
+		 */
+		newsize = BTreeTupleGetPostingOffset(firstright) + MAXALIGN(sizeof(ItemPointerData));
+		pivot = palloc0(newsize);
+		memcpy(pivot, firstright, BTreeTupleGetPostingOffset(firstright));
+
+		pivot->t_info &= ~INDEX_SIZE_MASK;
+		pivot->t_info |= newsize;
+		BTreeTupleClearBtIsPosting(pivot);
+		BTreeTupleSetAltHeapTID(pivot);
+	
+		Assert(!BTreeTupleIsPosting(pivot));
+	}
 	else
 	{
 		/*
@@ -2205,7 +2237,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
 	 */
 	pivotheaptid = (ItemPointer) ((char *) pivot + newsize -
 								  sizeof(ItemPointerData));
-	ItemPointerCopy(&lastleft->t_tid, pivotheaptid);
+	ItemPointerCopy(BTreeTupleGetMaxTID(lastleft), pivotheaptid);
 
 	/*
 	 * Lehman and Yao require that the downlink to the right page, which is to
@@ -2216,9 +2248,9 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
 	 * tiebreaker.
 	 */
 #ifndef DEBUG_NO_TRUNCATE
-	Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0);
-	Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0);
-	Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
+	Assert(ItemPointerCompare(BTreeTupleGetMaxTID(lastleft), BTreeTupleGetMinTID(firstright)) < 0);
+	Assert(ItemPointerCompare(pivotheaptid, BTreeTupleGetMinTID(lastleft)) >= 0);
+	Assert(ItemPointerCompare(pivotheaptid, BTreeTupleGetMinTID(firstright)) < 0);
 #else
 
 	/*
@@ -2231,7 +2263,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
 	 * attribute values along with lastleft's heap TID value when lastleft's
 	 * TID happens to be greater than firstright's TID.
 	 */
-	ItemPointerCopy(&firstright->t_tid, pivotheaptid);
+	ItemPointerCopy(BTreeTupleGetMinTID(firstright), pivotheaptid);
 
 	/*
 	 * Pivot heap TID should never be fully equal to firstright.  Note that
@@ -2240,7 +2272,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
 	 */
 	ItemPointerSetOffsetNumber(pivotheaptid,
 							   OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
-	Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
+	Assert(ItemPointerCompare(pivotheaptid, BTreeTupleGetMinTID(firstright)) < 0);
 #endif
 
 	BTreeTupleSetNAtts(pivot, nkeyatts);
@@ -2330,6 +2362,10 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
  * leaving excessive amounts of free space on either side of page split.
  * Callers can rely on the fact that attributes considered equal here are
  * definitely also equal according to _bt_keep_natts.
+ *
+ * To build a posting tuple we need to ensure that all attributes
+ * of both tuples are equal.  Use this function to compare them.
+ * TODO: it may be worth renaming this function.
  */
 int
 _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
@@ -2415,7 +2451,7 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 			 * Non-pivot tuples currently never use alternative heap TID
 			 * representation -- even those within heapkeyspace indexes
 			 */
-			if ((itup->t_info & INDEX_ALT_TID_MASK) != 0)
+			if (BTreeTupleIsPivot(itup))
 				return false;
 
 			/*
@@ -2470,7 +2506,7 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 			 * that to decide if the tuple is a pre-v11 tuple.
 			 */
 			return tupnatts == 0 ||
-				((itup->t_info & INDEX_ALT_TID_MASK) == 0 &&
+				(!BTreeTupleIsPivot(itup) &&
 				 ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
 		}
 		else
@@ -2497,7 +2533,7 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
 	 * heapkeyspace index pivot tuples, regardless of whether or not there are
 	 * non-key attributes.
 	 */
-	if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+	if (!BTreeTupleIsPivot(itup))
 		return false;
 
 	/*
@@ -2549,6 +2585,7 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
 	if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
 		return;
 
+	/* TODO correct error messages for posting tuples */
 	/*
 	 * Internal page insertions cannot fail here, because that would mean that
 	 * an earlier leaf level insertion that should have failed didn't
@@ -2575,3 +2612,59 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
 					 "or use full text indexing."),
 			 errtableconstraint(heap, RelationGetRelationName(rel))));
 }
+
+/*
+ * Given a base tuple that contains the key datums, build a posting tuple
+ * with the given posting list.
+ *
+ * The base tuple can itself be a posting tuple, but we only use its key part;
+ * all ItemPointers must be passed via ipd.
+ *
+ * If nipd == 1, fall back to building a non-posting tuple.  This is necessary
+ * to avoid storage overhead after a posting tuple has been vacuumed.
+ */
+IndexTuple
+BTreeFormPostingTuple(IndexTuple tuple, ItemPointerData *ipd, int nipd)
+{
+	uint32	   keysize, newsize;
+	IndexTuple itup;
+
+	/* We only need key part of the tuple */
+	if (BTreeTupleIsPosting(tuple))
+		keysize = BTreeTupleGetPostingOffset(tuple);
+	else
+		keysize = IndexTupleSize(tuple);
+
+	Assert(nipd > 0);
+
+	/* Add space needed for the posting list; for nipd == 1 keep just the key part */
+	newsize = (nipd > 1) ?
+		SHORTALIGN(keysize) + sizeof(ItemPointerData) * nipd : keysize;
+
+	newsize = MAXALIGN(newsize);
+	itup = palloc0(newsize);
+	memcpy(itup, tuple, keysize);
+	itup->t_info &= ~INDEX_SIZE_MASK;
+	itup->t_info |= newsize;
+
+
+	if (nipd > 1)
+	{
+		/* Form posting tuple, fill posting fields */
+
+		/* Set meta info about the posting list */
+		itup->t_info |= INDEX_ALT_TID_MASK;
+		BTreeSetPostingMeta(itup, nipd, SHORTALIGN(keysize));
+
+		/* Copy posting list into the posting tuple */
+		memcpy(BTreeTupleGetPosting(itup), ipd,
+			   sizeof(ItemPointerData) * nipd);
+	}
+	else
+	{
+		/* To finish building of a non-posting tuple, copy TID from ipd */
+		ItemPointerCopy(ipd, &itup->t_tid);
+	}
+
+	return itup;
+}
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 6532a25..16224b4 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -384,8 +384,8 @@ btree_xlog_vacuum(XLogReaderState *record)
 	Buffer		buffer;
 	Page		page;
 	BTPageOpaque opaque;
-#ifdef UNUSED
 	xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
+#ifdef UNUSED
 
 	/*
 	 * This section of code is thought to be no longer needed, after analysis
@@ -476,14 +476,36 @@ btree_xlog_vacuum(XLogReaderState *record)
 
 		if (len > 0)
 		{
-			OffsetNumber *unused;
-			OffsetNumber *unend;
+			if (xlrec->nremaining)
+			{
+				int				i;
+				OffsetNumber   *remainingoffset;
+				IndexTuple		remaining;
+				Size			itemsz;
+
+				remainingoffset = (OffsetNumber *)
+					(ptr + xlrec->ndeleted * sizeof(OffsetNumber));
+				remaining = (IndexTuple) ((char *) remainingoffset +
+						xlrec->nremaining * sizeof(OffsetNumber));
 
-			unused = (OffsetNumber *) ptr;
-			unend = (OffsetNumber *) ((char *) ptr + len);
+				/* Handle posting tuples */
+				for (i = 0; i < xlrec->nremaining; i++)
+				{
+					PageIndexTupleDelete(page, remainingoffset[i]);
+
+					itemsz = MAXALIGN(IndexTupleSize(remaining));
+
+					if (PageAddItem(page, (Item) remaining, itemsz, remainingoffset[i],
+							false, false) == InvalidOffsetNumber)
+							elog(PANIC, "btree_xlog_vacuum: failed to add remaining item");
+
+					remaining = (IndexTuple)((char*) remaining + itemsz);
+				}
+			}
 
-			if ((unend - unused) > 0)
-				PageIndexMultiDelete(page, unused, unend - unused);
+			
+			if (xlrec->ndeleted)
+				PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);
 		}
 
 		/*
diff --git a/src/include/access/itup.h b/src/include/access/itup.h
index 744ffb6..85ee040 100644
--- a/src/include/access/itup.h
+++ b/src/include/access/itup.h
@@ -141,6 +141,11 @@ typedef IndexAttributeBitMapData * IndexAttributeBitMap;
  * On such a page, N tuples could take one MAXALIGN quantum less space than
  * estimated here, seemingly allowing one more tuple than estimated here.
  * But such a page always has at least MAXALIGN special space, so we're safe.
+ *
+ * Note: btree leaf pages may contain posting tuples, which store duplicates
+ * more efficiently, so such pages may contain more tuples than estimated
+ * here.  Use MaxPostingIndexTuplesPerPage instead.
+ *
  */
 #define MaxIndexTuplesPerPage	\
 	((int) ((BLCKSZ - SizeOfPageHeaderData) / \
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index a3583f2..57ee21e 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -234,8 +234,7 @@ typedef struct BTMetaPageData
  *  t_tid | t_info | key values | INCLUDE columns, if any
  *
  * t_tid points to the heap TID, which is a tiebreaker key column as of
- * BTREE_VERSION 4.  Currently, the INDEX_ALT_TID_MASK status bit is never
- * set for non-pivot tuples.
+ * BTREE_VERSION 4.
  *
  * All other types of index tuples ("pivot" tuples) only have key columns,
  * since pivot tuples only exist to represent how the key space is
@@ -252,6 +251,39 @@ typedef struct BTMetaPageData
  * omitted rather than truncated, since its representation is different to
  * the non-pivot representation.)
  *
+ * Non-pivot posting tuple format:
+ *  t_tid | t_info | key values | INCLUDE columns, if any | posting_list[]
+ *
+ * In order to store duplicate keys more efficiently,
+ * BTREE_VERSION 5 introduced a new tuple format: posting tuples.
+ * posting_list is an array of ItemPointerData.
+ *
+ * This type of compression never applies to system indexes, unique indexes
+ * or indexes with INCLUDEd columns.
+ *
+ * To distinguish posting tuples, we use the INDEX_ALT_TID_MASK flag in t_info
+ * and the BT_IS_POSTING flag in t_tid.
+ * These flags redefine the content of the posting tuple's t_tid:
+ * - t_tid.ip_blkid contains the offset of the posting list.
+ * - the t_tid offset field contains the number of posting items in the tuple.
+ *
+ * The 12 least significant offset bits of t_tid are used to represent
+ * the number of posting items in posting tuples, leaving 4 status
+ * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
+ * future use.
+ * BT_N_POSTING_OFFSET_MASK is large enough to store any number of posting
+ * items, which is constrained by BTMaxItemSize.
+ *
+ * If a page contains so many duplicates that they do not fit into one posting
+ * tuple (bounded by BTMaxItemSize), the page may contain several posting
+ * tuples with the same key.
+ * A page can also contain both posting and non-posting tuples with the same key.
+ * Currently, posting tuples always contain at least two TIDs in the posting
+ * list.
+ *
+ * Posting tuples always have the same number of attributes as the index has
+ * generally.
+ *
  * Pivot tuple format:
  *
  *  t_tid | t_info | key values | [heap TID]
@@ -281,23 +313,149 @@ typedef struct BTMetaPageData
  * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which that are reserved for
  * future use.  BT_N_KEYS_OFFSET_MASK should be large enough to store any
  * number of columns/attributes <= INDEX_MAX_KEYS.
+ * The BT_IS_POSTING bit must be unset for pivot tuples, since we use it
+ * to distinguish posting tuples from pivot tuples.
  *
  * Note well: The macros that deal with the number of attributes in tuples
- * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple,
+ * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple
+ * or non-pivot posting tuple,
  * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot
  * tuple (or must have the same number of attributes as the index has
- * generally in the case of !heapkeyspace indexes).  They will need to be
- * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK
- * for something else.
+ * generally in the case of !heapkeyspace indexes).
  */
 #define INDEX_ALT_TID_MASK			INDEX_AM_RESERVED_BIT
 
 /* Item pointer offset bits */
 #define BT_RESERVED_OFFSET_MASK		0xF000
 #define BT_N_KEYS_OFFSET_MASK		0x0FFF
+#define BT_N_POSTING_OFFSET_MASK	0x0FFF
 #define BT_HEAP_TID_ATTR			0x1000
+#define BT_IS_POSTING				0x2000
+
+#define BTreeTupleIsPosting(itup)  \
+	( \
+		((itup)->t_info & INDEX_ALT_TID_MASK && \
+		((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0))\
+	)
 
-/* Get/set downlink block number */
+#define BTreeTupleIsPivot(itup)  \
+	( \
+		((itup)->t_info & INDEX_ALT_TID_MASK && \
+		((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0))\
+	)
+
+/*
+ * MaxPostingIndexTuplesPerPage is an upper bound on the number of tuples
+ * that can fit on one btree leaf page.
+ * 
+ *
+ * Btree leaf pages may contain posting tuples, which store duplicates
+ * more efficiently, so MaxPostingIndexTuplesPerPage is larger than
+ *
+ * Each leaf page must contain at least three items, so estimate it as
+ * if we have three posting tuples with minimal size keys.
+ */
+#define MaxPostingIndexTuplesPerPage \
+	((int) ((BLCKSZ - SizeOfPageHeaderData - \
+			3*((MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData))) )) / \
+			(sizeof(ItemPointerData)))
+
+/*
+ * Btree-private state needed to build posting tuples.
+ * ipd is a posting list - an array of ItemPointerData.
+ *
+ * While iterating over tuples during an index build, or while applying
+ * compression to a single page, we remember the previous tuple in itupprev
+ * and compare the next one with it.  If the tuples are equal, we save their
+ * TIDs in the posting list; ntuples contains the size of the posting list.
+ *
+ * maxitemsize and maxpostingsize are used to ensure that the resulting
+ * posting tuple will satisfy BTMaxItemSize.
+ */
+typedef struct BTCompressState
+{
+	Size			maxitemsize;
+	Size			maxpostingsize;
+	IndexTuple 		itupprev;
+	int 			ntuples;
+	ItemPointerData	*ipd;
+} BTCompressState;
+
+/* macros to work with posting tuples *BEGIN* */
+#define BTreeTupleSetBtIsPosting(itup) \
+	do { \
+		Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
+		Assert(!((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0)); \
+		ItemPointerSetOffsetNumber(&(itup)->t_tid, \
+			ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_IS_POSTING); \
+	} while(0)
+
+#define BTreeTupleClearBtIsPosting(itup) \
+	do { \
+		ItemPointerSetOffsetNumber(&(itup)->t_tid, \
+		ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & ~BT_IS_POSTING); \
+	} while(0)
+
+#define BTreeTupleGetNPosting(itup)	\
+	( \
+		ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_POSTING_OFFSET_MASK \
+	)
+
+#define BTreeTupleSetNPosting(itup, n) \
+	do { \
+		ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_POSTING_OFFSET_MASK); \
+		BTreeTupleSetBtIsPosting(itup); \
+	} while(0)
+
+/*
+ * If the tuple is a posting tuple, t_tid.ip_blkid contains the offset of the
+ * posting list.  The caller is responsible for checking BTreeTupleIsPosting()
+ * to ensure that it gets what it expects.
+ */
+#define BTreeTupleGetPostingOffset(itup) \
+	ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
+#define BTreeTupleSetPostingOffset(itup, offset) \
+	ItemPointerSetBlockNumber(&((itup)->t_tid), (offset))
+
+#define BTreeSetPostingMeta(itup, nposting, off) \
+	do { \
+		BTreeTupleSetNPosting(itup, nposting); \
+		BTreeTupleSetPostingOffset(itup, off); \
+	} while(0)
+
+#define BTreeTupleGetPosting(itup) \
+	(ItemPointerData*) ((char*)(itup) + BTreeTupleGetPostingOffset(itup))
+#define BTreeTupleGetPostingN(itup,n) \
+	(ItemPointerData*) (BTreeTupleGetPosting(itup) + (n))
+
+/*
+ * Posting tuples always contain several TIDs.
+ * Functions that use the TID as a tiebreaker can use the two macros
+ * below to ensure the correct ordering of TID keys:
+ */
+#define BTreeTupleGetMinTID(itup) \
+	( \
+		((itup)->t_info & INDEX_ALT_TID_MASK && \
+		((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING))) ? \
+		( \
+			(ItemPointer) BTreeTupleGetPosting(itup) \
+		) \
+		: \
+		(ItemPointer) &((itup)->t_tid) \
+	)
+#define BTreeTupleGetMaxTID(itup) \
+	( \
+		((itup)->t_info & INDEX_ALT_TID_MASK && \
+		((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING))) ? \
+		( \
+			(ItemPointer) (BTreeTupleGetPosting(itup) + (BTreeTupleGetNPosting(itup)-1)) \
+		) \
+		: \
+		(ItemPointer) &((itup)->t_tid) \
+	)
+/* macros to work with posting tuples *END* */
+
+/* Get/set downlink block number  */
 #define BTreeInnerTupleGetDownLink(itup) \
 	ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
 #define BTreeInnerTupleSetDownLink(itup, blkno) \
@@ -326,15 +484,18 @@ typedef struct BTMetaPageData
  */
 #define BTreeTupleGetNAtts(itup, rel)	\
 	( \
-		(itup)->t_info & INDEX_ALT_TID_MASK ? \
+		((itup)->t_info & INDEX_ALT_TID_MASK && \
+		((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0)) ? \
 		( \
 			ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
 		) \
 		: \
 		IndexRelationGetNumberOfAttributes(rel) \
 	)
+
 #define BTreeTupleSetNAtts(itup, n) \
 	do { \
+		Assert(!BTreeTupleIsPosting(itup)); \
 		(itup)->t_info |= INDEX_ALT_TID_MASK; \
 		ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \
 	} while(0)
@@ -342,6 +503,8 @@ typedef struct BTMetaPageData
 /*
  * Get tiebreaker heap TID attribute, if any.  Macro works with both pivot
  * and non-pivot tuples, despite differences in how heap TID is represented.
+ *
+ * For a non-pivot posting tuple, it returns the first TID from the posting list.
  */
 #define BTreeTupleGetHeapTID(itup) \
 	( \
@@ -351,7 +514,10 @@ typedef struct BTMetaPageData
 		(ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \
 					   sizeof(ItemPointerData)) \
 	  ) \
-	  : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \
+	  : (itup)->t_info & INDEX_ALT_TID_MASK ? \
+		(((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0) ? \
+		(ItemPointer) BTreeTupleGetPosting(itup) : NULL) \
+		: (ItemPointer) &((itup)->t_tid) \
 	)
 /*
  * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK
@@ -360,6 +526,7 @@ typedef struct BTMetaPageData
 #define BTreeTupleSetAltHeapTID(itup) \
 	do { \
 		Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
+		Assert(!((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0)); \
 		ItemPointerSetOffsetNumber(&(itup)->t_tid, \
 								   ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \
 	} while(0)
@@ -567,6 +734,8 @@ typedef struct BTScanPosData
 	 * location in the associated tuple storage workspace.
 	 */
 	int			nextTupleOffset;
+	/* prevTupleOffset is for posting list handling */
+	int			prevTupleOffset;
 
 	/*
 	 * The items array is always ordered in index order (ie, increasing
@@ -579,7 +748,7 @@ typedef struct BTScanPosData
 	int			lastItem;		/* last valid index in items[] */
 	int			itemIndex;		/* current index in items[] */
 
-	BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
+	BTScanPosItem items[MaxPostingIndexTuplesPerPage]; /* MUST BE LAST */
 } BTScanPosData;
 
 typedef BTScanPosData *BTScanPos;
@@ -763,6 +932,8 @@ extern void _bt_delitems_delete(Relation rel, Buffer buf,
 								OffsetNumber *itemnos, int nitems, Relation heapRel);
 extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
 								OffsetNumber *itemnos, int nitems,
+								OffsetNumber *remainingoffset,
+								IndexTuple *remaining, int nremaining,
 								BlockNumber lastBlockVacuumed);
 extern int	_bt_pagedel(Relation rel, Buffer buf);
 
@@ -813,7 +984,8 @@ extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page,
 							OffsetNumber offnum);
 extern void _bt_check_third_page(Relation rel, Relation heap,
 								 bool needheaptidspace, Page page, IndexTuple newtup);
-
+extern IndexTuple BTreeFormPostingTuple(IndexTuple tuple,
+									   ItemPointerData *ipd, int nipd);
 /*
  * prototypes for functions in nbtvalidate.c
  */
@@ -825,5 +997,6 @@ extern bool btvalidate(Oid opclassoid);
 extern IndexBuildResult *btbuild(Relation heap, Relation index,
 								 struct IndexInfo *indexInfo);
 extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);
-
+extern void add_item_to_posting(BTCompressState *compressState,
+								IndexTuple itup);
 #endif							/* NBTREE_H */
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index 9beccc8..c213bfa 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -172,11 +172,19 @@ typedef struct xl_btree_reuse_page
 typedef struct xl_btree_vacuum
 {
 	BlockNumber lastBlockVacuumed;
+	/*
+	 * This field helps us find the beginning of the remaining tuples
+	 * (from posting lists), which follow the array of offset numbers.
+	 */
+	uint32		nremaining;
+	uint32		ndeleted;
 
-	/* TARGET OFFSET NUMBERS FOLLOW */
+	/* REMAINING OFFSET NUMBERS FOLLOW (nremaining values) */
+	/* REMAINING TUPLES TO INSERT FOLLOW (if nremaining > 0) */
+	/* TARGET OFFSET NUMBERS FOLLOW (if any) */
 } xl_btree_vacuum;
 
-#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))
+#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, ndeleted) + sizeof(uint32))
 
 /*
  * This is what we need to know about marking an empty branch for deletion.
