From 13eff84d667f452e57bf8a38e516d6edc807ad9f Mon Sep 17 00:00:00 2001
From: erthalion <9erthalion6@gmail.com>
Date: Sat, 15 Sep 2018 21:14:50 +0200
Subject: [PATCH v12] Index skip scan

Implementation of Index Skip Scan (see Loose Index Scan in the wiki [1])
on top of IndexOnlyScan. To make it suitable for both situations when
there are small number of distinct values and significant amount of
distinct values the following approach is taken - instead of searching
from the root for every value we're searching for then first on the
current page, and then if not found continue searching from the root.

Original patch and design were proposed by Thomas Munro [2], revived and
improved by Jesper Pedersen, and a bit adjusted by Dmitry Dolgov.

[1] https://wiki.postgresql.org/wiki/Loose_indexscan
[2] https://www.postgresql.org/message-id/flat/CADLWmXXbTSBxP-MzJuPAYSsL_2f0iPm5VWPbCvDbVvfX93FKkw%40mail.gmail.com
---
 contrib/bloom/blutils.c                       |   1 +
 doc/src/sgml/config.sgml                      |  16 ++
 doc/src/sgml/indexam.sgml                     |  10 ++
 doc/src/sgml/indices.sgml                     |  24 +++
 src/backend/access/brin/brin.c                |   1 +
 src/backend/access/gin/ginutil.c              |   1 +
 src/backend/access/gist/gist.c                |   1 +
 src/backend/access/hash/hash.c                |   1 +
 src/backend/access/index/indexam.c            |  16 ++
 src/backend/access/nbtree/nbtree.c            |  12 ++
 src/backend/access/nbtree/nbtsearch.c         | 245 ++++++++++++++++++++++++++
 src/backend/access/spgist/spgutils.c          |   1 +
 src/backend/commands/explain.c                |  12 ++
 src/backend/executor/nodeIndexonlyscan.c      |  22 +++
 src/backend/nodes/copyfuncs.c                 |   1 +
 src/backend/nodes/outfuncs.c                  |   1 +
 src/backend/nodes/readfuncs.c                 |   1 +
 src/backend/optimizer/path/costsize.c         |   1 +
 src/backend/optimizer/plan/createplan.c       |  10 +-
 src/backend/optimizer/plan/planner.c          |  17 ++
 src/backend/optimizer/util/pathnode.c         |  40 +++++
 src/backend/optimizer/util/plancat.c          |   1 +
 src/backend/utils/misc/guc.c                  |   9 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/access/amapi.h                    |   5 +
 src/include/access/genam.h                    |   1 +
 src/include/access/nbtree.h                   |   5 +
 src/include/nodes/execnodes.h                 |   4 +
 src/include/nodes/pathnodes.h                 |   5 +
 src/include/nodes/plannodes.h                 |   1 +
 src/include/optimizer/cost.h                  |   1 +
 src/include/optimizer/pathnode.h              |   5 +
 src/test/regress/expected/create_index.out    |   1 +
 src/test/regress/expected/select_distinct.out |  25 +++
 src/test/regress/expected/sysviews.out        |   3 +-
 src/test/regress/sql/create_index.sql         |   2 +
 src/test/regress/sql/select_distinct.sql      |   7 +
 37 files changed, 506 insertions(+), 4 deletions(-)

diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
index 6458376578..f637635438 100644
--- a/contrib/bloom/blutils.c
+++ b/contrib/bloom/blutils.c
@@ -129,6 +129,7 @@ blhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = blbulkdelete;
 	amroutine->amvacuumcleanup = blvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = blcostestimate;
 	amroutine->amoptions = bloptions;
 	amroutine->amproperty = NULL;
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index fe1735722a..0e255ad27f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -4326,6 +4326,22 @@ ANY <replaceable class="parameter">num_sync</replaceable> ( <replaceable class="
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-enable-indexskipscan" xreflabel="enable_indexskipscan">
+      <term><varname>enable_indexskipscan</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>enable_indexskipscan</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Enables or disables the query planner's use of index-skip-scan plan
+        types (see <xref linkend="indexes-index-skip-scans"/>). This parameter requires
+        that <varname>enable_indexonlyscan</varname> is <literal>on</literal>.
+        The default is <literal>on</literal>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-enable-material" xreflabel="enable_material">
       <term><varname>enable_material</varname> (<type>boolean</type>)
       <indexterm>
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index 05102724ea..1b12ad9493 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -135,6 +135,7 @@ typedef struct IndexAmRoutine
     amendscan_function amendscan;
     ammarkpos_function ammarkpos;       /* can be NULL */
     amrestrpos_function amrestrpos;     /* can be NULL */
+    amskip_function amskip;     /* can be NULL */
 
     /* interface functions to support parallel index scans */
     amestimateparallelscan_function amestimateparallelscan;    /* can be NULL */
@@ -666,6 +667,15 @@ amrestrpos (IndexScanDesc scan);
 
   <para>
 <programlisting>
+bool
+amskip (IndexScanDesc scan, ScanDirection direction, int prefix);
+</programlisting>
+  Skip past all tuples where the first 'prefix' columns have the same value as
+  the last tuple returned in the current scan.
+  </para>
+
+  <para>
+<programlisting>
 Size
 amestimateparallelscan (void);
 </programlisting>
diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml
index 9943e8ecd4..776e42ca35 100644
--- a/doc/src/sgml/indices.sgml
+++ b/doc/src/sgml/indices.sgml
@@ -1221,6 +1221,30 @@ SELECT target FROM tests WHERE subject = 'some-subject' AND success;
    and later will recognize such cases and allow index-only scans to be
    generated, but older versions will not.
   </para>
+
+  <sect2 id="indexes-index-skip-scans">
+    <title>Index Skip Scans</title>
+
+    <indexterm zone="indexes-index-skip-scans">
+      <primary>index</primary>
+      <secondary>index-skip scans</secondary>
+    </indexterm>
+    <indexterm zone="indexes-index-skip-scans">
+      <primary>index-skip scan</primary>
+    </indexterm>
+
+    <para>
+     In case if index scan is used to retrieve the distinct values of a column
+     efficiently, it can be not very efficient, since it requires to scan all
+     the equal values of a key. In such cases planner will consider apply index
+     skip scan approach, which is based on the idea of
+     <ulink url="https://wiki.postgresql.org/wiki/Free_Space_Map_Problems">
+     Loose index scan</ulink>. Rather than scanning all equal values of a key,
+     as soon as a new value is found, it will search for a larger value on the
+     same index page, and if not found, restart the search by looking for a
+     larger value. This is much faster when the index has many equal keys.
+    </para>
+  </sect2>
  </sect1>
 
 
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 8f008dd008..639c8d7115 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -108,6 +108,7 @@ brinhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = brinbulkdelete;
 	amroutine->amvacuumcleanup = brinvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = brincostestimate;
 	amroutine->amoptions = brinoptions;
 	amroutine->amproperty = NULL;
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index afc20232ac..36f32f15a4 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -61,6 +61,7 @@ ginhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = ginbulkdelete;
 	amroutine->amvacuumcleanup = ginvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = gincostestimate;
 	amroutine->amoptions = ginoptions;
 	amroutine->amproperty = NULL;
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 2ce5425ef9..e44bc353b4 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -84,6 +84,7 @@ gisthandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = gistbulkdelete;
 	amroutine->amvacuumcleanup = gistvacuumcleanup;
 	amroutine->amcanreturn = gistcanreturn;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = gistcostestimate;
 	amroutine->amoptions = gistoptions;
 	amroutine->amproperty = gistproperty;
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index f1f01a0956..07d7eeda56 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -79,6 +79,7 @@ hashhandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = hashbulkdelete;
 	amroutine->amvacuumcleanup = hashvacuumcleanup;
 	amroutine->amcanreturn = NULL;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = hashcostestimate;
 	amroutine->amoptions = hashoptions;
 	amroutine->amproperty = NULL;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 4ad30186d9..4f3774128b 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -33,6 +33,7 @@
  *		index_can_return	- does index support index-only scans?
  *		index_getprocid - get a support procedure OID
  *		index_getprocinfo - get a support procedure's lookup info
+ *		index_skip		- advance past duplicate key values in a scan
  *
  * NOTES
  *		This file contains the index_ routines which used
@@ -792,6 +793,21 @@ index_can_return(Relation indexRelation, int attno)
 	return indexRelation->rd_indam->amcanreturn(indexRelation, attno);
 }
 
+/* ----------------
+ *		index_skip
+ *
+ *		Skip past all tuples where the first 'prefix' columns have the
+ *		same value as the last tuple returned in the current scan.
+ * ----------------
+ */
+bool
+index_skip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+	SCAN_CHECKS;
+
+	return scan->indexRelation->rd_indam->amskip(scan, direction, prefix);
+}
+
 /* ----------------
  *		index_getprocid
  *
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 98917de2ef..134eda34ed 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -130,6 +130,7 @@ bthandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = btbulkdelete;
 	amroutine->amvacuumcleanup = btvacuumcleanup;
 	amroutine->amcanreturn = btcanreturn;
+	amroutine->amskip = btskip;
 	amroutine->amcostestimate = btcostestimate;
 	amroutine->amoptions = btoptions;
 	amroutine->amproperty = btproperty;
@@ -378,6 +379,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 	 */
 	so->currTuples = so->markTuples = NULL;
 
+	so->skipScanKey = NULL;
+
 	scan->xs_itupdesc = RelationGetDescr(rel);
 
 	scan->opaque = so;
@@ -445,6 +448,15 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	_bt_preprocess_array_keys(scan);
 }
 
+/*
+ * btskip() -- skip to the beginning of the next key prefix
+ */
+bool
+btskip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+	return _bt_skip(scan, direction, prefix);
+}
+
 /*
  *	btendscan() -- close down a scan
  */
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 92832237a8..80330c6fe7 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1192,6 +1192,251 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
 	return true;
 }
 
+/*
+ *  _bt_skip() -- Skip items that have the same prefix as the most recently
+ * fetched index tuple.  The current position is set so that a subsequent call
+ * to _bt_next will fetch the first tuple that differs in the leading 'prefix'
+ * keys.
+ */
+bool
+_bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	BTStack stack;
+	Buffer buf;
+	OffsetNumber offnum;
+	BTScanPosItem *currItem;
+	Page		 page;
+	BTPageOpaque opaque;
+	OffsetNumber low, high, compare_offset;
+	Relation 	 indexRel = scan->indexRelation;
+	int 		 compare_value = ScanDirectionIsForward(dir) ? 0 : 1;
+
+	/* We want to return tuples, and we need a starting point */
+	Assert(scan->xs_want_itup);
+	Assert(scan->xs_itup);
+
+	/*
+	 * If skipScanKey is NULL then we initialize it with _bt_mkscankey,
+	 * otherwise we will just update the sk_flags / sk_argument elements
+	 * in order to eliminate repeated free/realloc.
+	 */
+	if (so->skipScanKey == NULL)
+	{
+		so->skipScanKey = _bt_mkscankey(indexRel, scan->xs_itup);
+	}
+	else
+	{
+		TupleDesc	itupdesc;
+		int			indnkeyatts;
+		int i;
+
+		itupdesc = RelationGetDescr(indexRel);
+		indnkeyatts = IndexRelationGetNumberOfKeyAttributes(indexRel);
+		for (i = 0; i < indnkeyatts; i++)
+		{
+			Datum datum;
+			bool null;
+			int flags;
+
+			datum = index_getattr(scan->xs_itup, i + 1, itupdesc, &null);
+			flags = (null ? SK_ISNULL : 0) |
+					(indexRel->rd_indoption[i] << SK_BT_INDOPTION_SHIFT);
+			so->skipScanKey[i].sk_flags = flags;
+			so->skipScanKey[i].sk_argument = datum;
+		}
+	}
+
+	/* Check if the next unique key can be found within the current page */
+	if (BTScanPosIsValid(so->currPos))
+	{
+		buf = so->currPos.buf;
+
+		page = BufferGetPage(buf);
+		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+		low = P_FIRSTDATAKEY(opaque);
+		high = PageGetMaxOffsetNumber(page);
+		compare_offset = ScanDirectionIsForward(dir) ? high : low;
+
+		if(_bt_compare(scan->indexRelation, prefix,
+					   so->skipScanKey, page, compare_offset) > compare_value)
+		{
+			bool keyFound = false;
+
+			LockBuffer(buf, BT_READ);
+			offnum = _bt_binsrch(scan->indexRelation, buf, prefix, so->skipScanKey,
+								 ScanDirectionIsForward(dir));
+
+			/* Lock the page for SERIALIZABLE transactions */
+			PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(buf),
+							  scan->xs_snapshot);
+
+			/* We know in which direction to look */
+			_bt_initialize_more_data(so, dir);
+
+			if (ScanDirectionIsForward(dir))
+			{
+				/* Move back for _bt_next */
+				offnum = OffsetNumberPrev(offnum);
+			}
+
+			/* Now read the data */
+			keyFound = _bt_readpage(scan, dir, offnum);
+			_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+
+			if (keyFound)
+			{
+				/* set IndexTuple */
+				currItem = &so->currPos.items[so->currPos.itemIndex];
+				scan->xs_ctup.t_self = currItem->heapTid;
+				if (scan->xs_want_itup)
+					scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+				return true;
+			}
+		}
+	}
+
+	if (BTScanPosIsValid(so->currPos))
+	{
+		ReleaseBuffer(so->currPos.buf);
+		so->currPos.buf = InvalidBuffer;
+	}
+
+	/*
+	 * We haven't found scan key within the current page, so let's scan from
+	 * the root. Use _bt_search and _bt_binsrch to get the buffer and offset
+	 * number
+	 */
+	stack = _bt_search(scan->indexRelation, prefix, so->skipScanKey,
+					   ScanDirectionIsForward(dir), &buf, BT_READ,
+					   scan->xs_snapshot);
+	_bt_freestack(stack);
+	so->currPos.buf = buf;
+	if (ScanDirectionIsForward(dir))
+	{
+		offnum = _bt_binsrch(scan->indexRelation, buf, prefix, so->skipScanKey,
+							 ScanDirectionIsForward(dir));
+	}
+	else
+	{
+		/* For backward scan finding offnum is more involved. It is wrong to
+		 * just use binary search, since we will find the last item from the
+		 * sequence of equal items, and we need the first one. Otherwise e.g.
+		 * backward cursor scan will return an incorrect value. */
+		TupleDesc	itupdesc;
+		int			indnkeyatts;
+		int 		i;
+
+		offnum = _bt_binsrch(scan->indexRelation, buf, prefix, so->skipScanKey,
+							 ScanDirectionIsForward(dir));
+		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+
+		/* One step back to find a previous value */
+		if (_bt_next(scan, dir))
+		{
+			itupdesc = RelationGetDescr(indexRel);
+			indnkeyatts = IndexRelationGetNumberOfKeyAttributes(indexRel);
+			for (i = 0; i < indnkeyatts; i++)
+			{
+				Datum datum;
+				bool null;
+				int flags;
+
+				datum = index_getattr(scan->xs_itup, i + 1, itupdesc, &null);
+				flags = (null ? SK_ISNULL : 0) |
+						(indexRel->rd_indoption[i] << SK_BT_INDOPTION_SHIFT);
+				so->skipScanKey[i].sk_flags = flags;
+				so->skipScanKey[i].sk_argument = datum;
+			}
+
+			/* And now find the last item from the sequence for the current,
+			 * value with the intention do OffsetNumberNext. As a result we
+			 * end up on a first element from the sequence. */
+			buf = so->currPos.buf;
+
+			page = BufferGetPage(buf);
+			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+			low = P_FIRSTDATAKEY(opaque);
+			high = PageGetMaxOffsetNumber(page);
+			compare_offset = ScanDirectionIsForward(dir) ? high : low;
+
+			if(_bt_compare(scan->indexRelation, prefix,
+						   so->skipScanKey, page, compare_offset) > compare_value)
+			{
+				offnum = _bt_binsrch(scan->indexRelation, buf, prefix, so->skipScanKey,
+									 ScanDirectionIsForward(dir));
+			}
+			else
+			{
+				ReleaseBuffer(so->currPos.buf);
+				so->currPos.buf = InvalidBuffer;
+
+				stack = _bt_search(scan->indexRelation, prefix, so->skipScanKey,
+								   ScanDirectionIsForward(dir), &buf, BT_READ,
+								   scan->xs_snapshot);
+				_bt_freestack(stack);
+				so->currPos.buf = buf;
+				offnum = _bt_binsrch(scan->indexRelation, buf, prefix, so->skipScanKey,
+									 ScanDirectionIsForward(dir));
+			}
+		}
+		else
+		{
+			_bt_freeskey(so->skipScanKey);
+			so->skipScanKey = NULL;
+			return false;
+		}
+	}
+
+	/* Lock the page for SERIALIZABLE transactions */
+	PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(buf),
+					  scan->xs_snapshot);
+
+	/* We know in which direction to look */
+	_bt_initialize_more_data(so, dir);
+
+	if (ScanDirectionIsForward(dir))
+	{
+		/* Move back for _bt_next */
+		offnum = OffsetNumberPrev(offnum);
+	}
+	else
+	{
+		offnum = OffsetNumberNext(offnum);
+	}
+
+	/* Now read the data */
+	if (!_bt_readpage(scan, dir, offnum))
+	{
+		/*
+		 * There's no actually-matching data on this page.  Try to advance to
+		 * the next page.  Return false if there's no matching data at all.
+		 */
+		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+		if (!_bt_steppage(scan, dir))
+		{
+			_bt_freeskey(so->skipScanKey);
+			so->skipScanKey = NULL;
+			return false;
+		}
+	}
+	else
+	{
+		/* Drop the lock, and maybe the pin, on the current page */
+		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+	}
+
+	/* And set IndexTuple */
+	currItem = &so->currPos.items[so->currPos.itemIndex];
+	scan->xs_ctup.t_self = currItem->heapTid;
+	if (scan->xs_want_itup)
+		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+	return true;
+}
+
 /*
  *	_bt_readpage() -- Load data from current index page into so->currPos
  *
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
index 8e63c1fad2..a55bd5e9f5 100644
--- a/src/backend/access/spgist/spgutils.c
+++ b/src/backend/access/spgist/spgutils.c
@@ -64,6 +64,7 @@ spghandler(PG_FUNCTION_ARGS)
 	amroutine->ambulkdelete = spgbulkdelete;
 	amroutine->amvacuumcleanup = spgvacuumcleanup;
 	amroutine->amcanreturn = spgcanreturn;
+	amroutine->amskip = NULL;
 	amroutine->amcostestimate = spgcostestimate;
 	amroutine->amoptions = spgoptions;
 	amroutine->amproperty = spgproperty;
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 1831ea81cf..c41995b247 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1297,6 +1297,14 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			{
 				IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan;
 
+				if (indexonlyscan->skipPrefixSize > 0)
+				{
+					if (es->format != EXPLAIN_FORMAT_TEXT)
+						ExplainPropertyInteger("Distinct Prefix", NULL,
+											   indexonlyscan->skipPrefixSize,
+											   es);
+				}
+
 				ExplainIndexScanDetails(indexonlyscan->indexid,
 										indexonlyscan->indexorderdir,
 										es);
@@ -1519,6 +1527,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
 										   planstate, es);
 			break;
 		case T_IndexOnlyScan:
+			if (((IndexOnlyScan *) plan)->skipPrefixSize > 0)
+			{
+				ExplainPropertyText("Scan mode", "Skip scan", es);
+			}
 			show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
 						   "Index Cond", planstate, ancestors, es);
 			if (((IndexOnlyScan *) plan)->indexqual)
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 26758e7703..dced29df04 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -114,6 +114,24 @@ IndexOnlyNext(IndexOnlyScanState *node)
 						 node->ioss_NumOrderByKeys);
 	}
 
+	/*
+	 * Check if we need to skip to the next key prefix, because we've been
+	 * asked to implement DISTINCT.
+	 */
+	if (node->ioss_SkipPrefixSize > 0 && node->ioss_FirstTupleEmitted)
+	{
+		if (!index_skip(scandesc, direction, node->ioss_SkipPrefixSize))
+		{
+			/* Reached end of index. At this point currPos is invalidated,
+			 * and we need to reset ioss_FirstTupleEmitted, since otherwise
+			 * after going backwards, reaching the end of index, and going
+			 * forward again we apply skip again. It would be incorrect and
+			 * lead to an extra skipped item. */
+			node->ioss_FirstTupleEmitted = false;
+			return ExecClearTuple(slot);
+		}
+	}
+
 	/*
 	 * OK, now that we have what we need, fetch the next tuple.
 	 */
@@ -249,6 +267,8 @@ IndexOnlyNext(IndexOnlyScanState *node)
 							  ItemPointerGetBlockNumber(tid),
 							  estate->es_snapshot);
 
+		node->ioss_FirstTupleEmitted = true;
+
 		return slot;
 	}
 
@@ -499,6 +519,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
 	indexstate->ss.ps.plan = (Plan *) node;
 	indexstate->ss.ps.state = estate;
 	indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan;
+	indexstate->ioss_SkipPrefixSize = node->skipPrefixSize;
+	indexstate->ioss_FirstTupleEmitted = false;
 
 	/*
 	 * Miscellaneous initialization
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index a8a735c247..17f71cae25 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -514,6 +514,7 @@ _copyIndexOnlyScan(const IndexOnlyScan *from)
 	COPY_NODE_FIELD(indexorderby);
 	COPY_NODE_FIELD(indextlist);
 	COPY_SCALAR_FIELD(indexorderdir);
+	COPY_SCALAR_FIELD(skipPrefixSize);
 
 	return newnode;
 }
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 69179a07c3..6f099d1ed4 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -572,6 +572,7 @@ _outIndexOnlyScan(StringInfo str, const IndexOnlyScan *node)
 	WRITE_NODE_FIELD(indexorderby);
 	WRITE_NODE_FIELD(indextlist);
 	WRITE_ENUM_FIELD(indexorderdir, ScanDirection);
+	WRITE_INT_FIELD(skipPrefixSize);
 }
 
 static void
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 4b845b1bb7..88136560ad 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -1804,6 +1804,7 @@ _readIndexOnlyScan(void)
 	READ_NODE_FIELD(indexorderby);
 	READ_NODE_FIELD(indextlist);
 	READ_ENUM_FIELD(indexorderdir, ScanDirection);
+	READ_INT_FIELD(skipPrefixSize);
 
 	READ_DONE();
 }
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 4b9be13f08..91a35e9beb 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -124,6 +124,7 @@ int			max_parallel_workers_per_gather = 2;
 bool		enable_seqscan = true;
 bool		enable_indexscan = true;
 bool		enable_indexonlyscan = true;
+bool		enable_indexskipscan = true;
 bool		enable_bitmapscan = true;
 bool		enable_tidscan = true;
 bool		enable_sort = true;
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 9fbe5b2a5f..987afe7999 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -178,7 +178,8 @@ static IndexOnlyScan *make_indexonlyscan(List *qptlist, List *qpqual,
 				   Index scanrelid, Oid indexid,
 				   List *indexqual, List *indexorderby,
 				   List *indextlist,
-				   ScanDirection indexscandir);
+				   ScanDirection indexscandir,
+				   int skipprefix);
 static BitmapIndexScan *make_bitmap_indexscan(Index scanrelid, Oid indexid,
 					  List *indexqual,
 					  List *indexqualorig);
@@ -2754,7 +2755,8 @@ create_indexscan_plan(PlannerInfo *root,
 												fixed_indexquals,
 												fixed_indexorderbys,
 												best_path->indexinfo->indextlist,
-												best_path->indexscandir);
+												best_path->indexscandir,
+												best_path->indexskipprefix);
 	else
 		scan_plan = (Scan *) make_indexscan(tlist,
 											qpqual,
@@ -5030,7 +5032,8 @@ make_indexonlyscan(List *qptlist,
 				   List *indexqual,
 				   List *indexorderby,
 				   List *indextlist,
-				   ScanDirection indexscandir)
+				   ScanDirection indexscandir,
+				   int skipPrefixSize)
 {
 	IndexOnlyScan *node = makeNode(IndexOnlyScan);
 	Plan	   *plan = &node->scan.plan;
@@ -5045,6 +5048,7 @@ make_indexonlyscan(List *qptlist,
 	node->indexorderby = indexorderby;
 	node->indextlist = indextlist;
 	node->indexorderdir = indexscandir;
+	node->skipPrefixSize = skipPrefixSize;
 
 	return node;
 }
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 5e3a7120ff..a8c0ba4142 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4691,6 +4691,23 @@ create_distinct_paths(PlannerInfo *root,
 												  path,
 												  list_length(root->distinct_pathkeys),
 												  numDistinctRows));
+
+				/* Also consider a skip scan, if possible. */
+				if (IsA(path, IndexPath) &&
+					path->pathtype == T_IndexOnlyScan &&
+					enable_indexskipscan &&
+					((IndexPath *) path)->indexinfo->amcanskip &&
+					root->distinct_pathkeys != NIL)
+				{
+					int distinctPrefixKeys = list_length(root->distinct_pathkeys);
+
+					add_path(distinct_rel,
+							 create_skipscan_unique_path(root,
+														 distinct_rel,
+														 path,
+														 distinctPrefixKeys,
+														 numDistinctRows));
+				}
 			}
 		}
 
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 169e51e792..140899b556 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -2884,6 +2884,46 @@ create_upper_unique_path(PlannerInfo *root,
 	return pathnode;
 }
 
+/*
+ * create_skipscan_unique_path
+ *	  Creates a pathnode the same as an existing IndexPath except based on
+ *	  skipping duplicate values.  This may or may not be cheaper than using
+ *	  create_upper_unique_path.
+ *
+ * The input path must be an IndexPath for an index that supports amskip.
+ */
+IndexPath *
+create_skipscan_unique_path(PlannerInfo *root,
+							RelOptInfo *rel,
+							Path *basepath,
+							int distinctPrefixKeys,
+							double numGroups)
+{
+	IndexPath *pathnode = makeNode(IndexPath);
+
+	Assert(IsA(basepath, IndexPath));
+
+	/* We don't want to modify basepath, so make a copy. */
+	memcpy(pathnode, basepath, sizeof(IndexPath));
+
+	/* The size of the prefix we'll use for skipping. */
+	Assert(pathnode->indexinfo->amcanskip);
+	Assert(distinctPrefixKeys > 0);
+	Assert(distinctPrefixKeys <= list_length(pathnode->path.pathkeys));
+	pathnode->indexskipprefix = distinctPrefixKeys;
+
+	/*
+	 * The cost to skip to each distinct value should be roughly the same as
+	 * the cost of finding the first key times the number of distinct values
+	 * we expect to find.
+	 */
+	pathnode->path.startup_cost = basepath->startup_cost;
+	pathnode->path.total_cost = basepath->startup_cost * numGroups;
+	pathnode->path.rows = numGroups;
+
+	return pathnode;
+}
+
 /*
  * create_agg_path
  *	  Creates a pathnode that represents performing aggregation/grouping
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 30f4dc151b..81852da830 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -271,6 +271,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			info->amoptionalkey = amroutine->amoptionalkey;
 			info->amsearcharray = amroutine->amsearcharray;
 			info->amsearchnulls = amroutine->amsearchnulls;
+			info->amcanskip = (amroutine->amskip != NULL);
 			info->amcanparallel = amroutine->amcanparallel;
 			info->amhasgettuple = (amroutine->amgettuple != NULL);
 			info->amhasgetbitmap = (amroutine->amgetbitmap != NULL);
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index fe6c6f8a05..dbe46f2959 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -905,6 +905,15 @@ static struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"enable_indexskipscan", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables the planner's use of index-skip-scan plans."),
+			NULL
+		},
+		&enable_indexskipscan,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of bitmap-scan plans."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index cccb5f145a..55da1df89c 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -347,6 +347,7 @@
 #enable_hashjoin = on
 #enable_indexscan = on
 #enable_indexonlyscan = on
+#enable_indexskipscan = on
 #enable_material = on
 #enable_mergejoin = on
 #enable_nestloop = on
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 653ddc976b..082a9bb0d6 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -127,6 +127,10 @@ typedef void (*amrescan_function) (IndexScanDesc scan,
 typedef bool (*amgettuple_function) (IndexScanDesc scan,
 									 ScanDirection direction);
 
+/* skip past duplicates in a given prefix */
+typedef bool (*amskip_function) (IndexScanDesc scan,
+								 ScanDirection dir, int prefix);
+
 /* fetch all valid tuples */
 typedef int64 (*amgetbitmap_function) (IndexScanDesc scan,
 									   TIDBitmap *tbm);
@@ -221,6 +225,7 @@ typedef struct IndexAmRoutine
 	amendscan_function amendscan;
 	ammarkpos_function ammarkpos;	/* can be NULL */
 	amrestrpos_function amrestrpos; /* can be NULL */
+	amskip_function amskip;				/* can be NULL */
 
 	/* interface functions to support parallel index scans */
 	amestimateparallelscan_function amestimateparallelscan; /* can be NULL */
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index c4aba39496..a9bf4f58a9 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -170,6 +170,7 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
 extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info,
 					 IndexBulkDeleteResult *stats);
 extern bool index_can_return(Relation indexRelation, int attno);
+extern bool index_skip(IndexScanDesc scan, ScanDirection direction, int prefix);
 extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
 				uint16 procnum);
 extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 60622ea790..56689cd1fb 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -470,6 +470,9 @@ typedef struct BTScanOpaqueData
 	 */
 	int			markItemIndex;	/* itemIndex, or -1 if not valid */
 
+	/* Work space for _bt_skip */
+	ScanKey		skipScanKey;	/* used to control skipping */
+
 	/* keep these last in struct for efficiency */
 	BTScanPosData currPos;		/* current position data */
 	BTScanPosData markPos;		/* marked position, if any */
@@ -570,6 +573,7 @@ extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
 			Page page, OffsetNumber offnum);
 extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
+extern bool _bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 				 Snapshot snapshot);
 
@@ -597,6 +601,7 @@ extern void _bt_end_vacuum_callback(int code, Datum arg);
 extern Size BTreeShmemSize(void);
 extern void BTreeShmemInit(void);
 extern bytea *btoptions(Datum reloptions, bool validate);
+extern bool btskip(IndexScanDesc scan, ScanDirection dir, int prefix);
 extern bool btproperty(Oid index_oid, int attno,
 		   IndexAMProperty prop, const char *propname,
 		   bool *res, bool *isnull);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index fd13c170d7..2598462e71 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1393,6 +1393,8 @@ typedef struct IndexScanState
  *		RelationDesc	   index relation descriptor
  *		ScanDesc		   index scan descriptor
  *		VMBuffer		   buffer in use for visibility map testing, if any
+ *		SkipPrefixSize	   number of keys for skip-based DISTINCT
+ *		FirstTupleEmitted  has the first tuple been emitted
  *		ioss_PscanLen	   Size of parallel index-only scan descriptor
  * ----------------
  */
@@ -1411,6 +1413,8 @@ typedef struct IndexOnlyScanState
 	Relation	ioss_RelationDesc;
 	struct IndexScanDescData *ioss_ScanDesc;
 	Buffer		ioss_VMBuffer;
+	int         ioss_SkipPrefixSize;
+	bool		ioss_FirstTupleEmitted;
 	Size		ioss_PscanLen;
 } IndexOnlyScanState;
 
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 253e0b7e48..653f3b174b 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -819,6 +819,7 @@ struct IndexOptInfo
 	bool		amsearchnulls;	/* can AM search for NULL/NOT NULL entries? */
 	bool		amhasgettuple;	/* does AM have amgettuple interface? */
 	bool		amhasgetbitmap; /* does AM have amgetbitmap interface? */
+	bool		amcanskip;		/* can AM skip duplicate values? */
 	bool		amcanparallel;	/* does AM support parallel scan? */
 	/* Rather than include amapi.h here, we declare amcostestimate like this */
 	void		(*amcostestimate) ();	/* AM's cost estimator */
@@ -1155,6 +1156,9 @@ typedef struct Path
  * we need not recompute them when considering using the same index in a
  * bitmap index/heap scan (see BitmapHeapPath).  The costs of the IndexPath
  * itself represent the costs of an IndexScan or IndexOnlyScan plan type.
+ *
+ * 'indexskipprefix' represents the number of columns to consider for skip
+ * scans.
  *----------
  */
 typedef struct IndexPath
@@ -1167,6 +1171,7 @@ typedef struct IndexPath
 	ScanDirection indexscandir;
 	Cost		indextotalcost;
 	Selectivity indexselectivity;
+	int			indexskipprefix;
 } IndexPath;
 
 /*
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index d66a187a53..dbd4ca6216 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -431,6 +431,7 @@ typedef struct IndexOnlyScan
 	List	   *indexorderby;	/* list of index ORDER BY exprs */
 	List	   *indextlist;		/* TargetEntry list describing index's cols */
 	ScanDirection indexorderdir;	/* forward or backward or don't care */
+	int			skipPrefixSize;	/* the size of the prefix for distinct scans */
 } IndexOnlyScan;
 
 /* ----------------
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index ac6de0f6be..33c7a0a376 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -50,6 +50,7 @@ extern PGDLLIMPORT int max_parallel_workers_per_gather;
 extern PGDLLIMPORT bool enable_seqscan;
 extern PGDLLIMPORT bool enable_indexscan;
 extern PGDLLIMPORT bool enable_indexonlyscan;
+extern PGDLLIMPORT bool enable_indexskipscan;
 extern PGDLLIMPORT bool enable_bitmapscan;
 extern PGDLLIMPORT bool enable_tidscan;
 extern PGDLLIMPORT bool enable_sort;
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index 574bb85b50..2c480bc9c8 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -202,6 +202,11 @@ extern UpperUniquePath *create_upper_unique_path(PlannerInfo *root,
 						 Path *subpath,
 						 int numCols,
 						 double numGroups);
+extern IndexPath *create_skipscan_unique_path(PlannerInfo *root,
+				  RelOptInfo *rel,
+				  Path *subpath,
+				  int numCols,
+				  double numGroups);
 extern AggPath *create_agg_path(PlannerInfo *root,
 				RelOptInfo *rel,
 				Path *subpath,
diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
index 5d4eb59a0c..f982ddbfb6 100644
--- a/src/test/regress/expected/create_index.out
+++ b/src/test/regress/expected/create_index.out
@@ -19,6 +19,7 @@ CREATE INDEX tenk1_unique1 ON tenk1 USING btree(unique1 int4_ops);
 CREATE INDEX tenk1_unique2 ON tenk1 USING btree(unique2 int4_ops);
 CREATE INDEX tenk1_hundred ON tenk1 USING btree(hundred int4_ops);
 CREATE INDEX tenk1_thous_tenthous ON tenk1 (thousand, tenthous);
+CREATE INDEX tenk1_four ON tenk1 (four);
 CREATE INDEX tenk2_unique1 ON tenk2 USING btree(unique1 int4_ops);
 CREATE INDEX tenk2_unique2 ON tenk2 USING btree(unique2 int4_ops);
 CREATE INDEX tenk2_hundred ON tenk2 USING btree(hundred int4_ops);
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index f3696c6d1d..38c9bc4b9b 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -244,3 +244,28 @@ SELECT null IS NOT DISTINCT FROM null as "yes";
  t
 (1 row)
 
+-- index skip scan
+SELECT DISTINCT four FROM tenk1;
+ four 
+------
+    0
+    1
+    2
+    3
+(4 rows)
+
+SELECT DISTINCT four FROM tenk1 WHERE four = 1;
+ four 
+------
+    1
+(1 row)
+
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT DISTINCT four FROM tenk1;
+                    QUERY PLAN                    
+--------------------------------------------------
+ Index Only Scan using tenk1_four on public.tenk1
+   Output: four
+   Scan mode: Skip scan
+(3 rows)
+
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index a1c90eb905..bd3b373515 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -78,6 +78,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_hashjoin                | on
  enable_indexonlyscan           | on
  enable_indexscan               | on
+ enable_indexskipscan           | on
  enable_material                | on
  enable_mergejoin               | on
  enable_nestloop                | on
@@ -89,7 +90,7 @@ select name, setting from pg_settings where name like 'enable%';
  enable_seqscan                 | on
  enable_sort                    | on
  enable_tidscan                 | on
-(17 rows)
+(18 rows)
 
 -- Test that the pg_timezone_names and pg_timezone_abbrevs views are
 -- more-or-less working.  We can't test their contents in any great detail
diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql
index 67ecad8dd5..f6e95eae57 100644
--- a/src/test/regress/sql/create_index.sql
+++ b/src/test/regress/sql/create_index.sql
@@ -26,6 +26,8 @@ CREATE INDEX tenk1_hundred ON tenk1 USING btree(hundred int4_ops);
 
 CREATE INDEX tenk1_thous_tenthous ON tenk1 (thousand, tenthous);
 
+CREATE INDEX tenk1_four ON tenk1 (four);
+
 CREATE INDEX tenk2_unique1 ON tenk2 USING btree(unique1 int4_ops);
 
 CREATE INDEX tenk2_unique2 ON tenk2 USING btree(unique2 int4_ops);
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index a605e86449..992e8d7c4d 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -73,3 +73,10 @@ SELECT 1 IS NOT DISTINCT FROM 2 as "no";
 SELECT 2 IS NOT DISTINCT FROM 2 as "yes";
 SELECT 2 IS NOT DISTINCT FROM null as "no";
 SELECT null IS NOT DISTINCT FROM null as "yes";
+
+-- index skip scan
+SELECT DISTINCT four FROM tenk1;
+SELECT DISTINCT four FROM tenk1 WHERE four = 1;
+
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT DISTINCT four FROM tenk1;
-- 
2.16.4

