On Tue, Dec 2, 2014 at 2:16 PM, Peter Geoghegan <p...@heroku.com> wrote:
> On Tue, Dec 2, 2014 at 2:07 PM, Robert Haas <robertmh...@gmail.com> wrote:
>> Well, maybe you should make the updates we've agreed on and I can take
>> another look at it.
>
> Agreed.

The attached, revised patchset makes these updates. I continue to use the
sortsupport struct to convey that a given attribute of a given sort is
abbreviation-applicable (although the field is now a bool, not an
enum).
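
To illustrate, here is roughly how a tuplesort caller flags the leading
attribute (a minimal sketch that just mirrors what tuplesort_begin_heap()
does in the 1/2 patch, not anything new):

    SortSupport sortKey = &state->sortKeys[i];

    sortKey->ssup_cxt = CurrentMemoryContext;
    sortKey->ssup_collation = sortCollations[i];
    sortKey->ssup_nulls_first = nullsFirstFlags[i];
    sortKey->ssup_attno = attNums[i];
    /* the new bool: abbreviation is only applicable to the leading attribute */
    sortKey->abbreviate = (i == 0);

    PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);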

-- 
Peter Geoghegan
From 865a1abeb6bfb601b1ec605afb1e339c0e444e10 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <p...@heroku.com>
Date: Sun, 9 Nov 2014 14:38:44 -0800
Subject: [PATCH 2/2] Estimate total number of rows to be sorted

Sortsupport opclasses now accept a row hint, indicating the estimated
number of rows to be sorted.  This gives opclasses a sense of proportion
about how far along the copying of tuples is when considering aborting
abbreviation.

Estimates come from various sources (e.g. the planner's plan_rows, or
the heap relation's reltuples).  The text opclass now always avoids
aborting abbreviation if the total number of rows to be sorted is high
enough, without considering cardinality at all.
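
In rough outline, the hint gates the abort decision as follows (a
condensed sketch of the logic this patch adds to bttext_abbrev_abort(),
not separate code):

    if (estrows > 0)
    {
        double remaining = (estrows - memtupcount) / estrows;

        if (remaining > 0.95 && memtupcount < 200000)
            return false;   /* too early to judge; keep abbreviating */
        if (remaining < 0.65)
            return false;   /* too invested; aborting cannot pay off */
        if (estrows * LOG2(estrows) > estrows * 20)
            return false;   /* ~1 million rows or more; never abort */
    }
    /* otherwise, fall through to the HyperLogLog cardinality checks */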
---
 src/backend/access/nbtree/nbtree.c     |  5 ++-
 src/backend/access/nbtree/nbtsort.c    | 14 +++++-
 src/backend/commands/cluster.c         |  4 +-
 src/backend/executor/nodeAgg.c         |  5 ++-
 src/backend/executor/nodeSort.c        |  1 +
 src/backend/utils/adt/orderedsetaggs.c |  2 +-
 src/backend/utils/adt/varlena.c        | 80 ++++++++++++++++++++++++++++++++--
 src/backend/utils/sort/tuplesort.c     | 14 ++++--
 src/include/access/nbtree.h            |  2 +-
 src/include/utils/sortsupport.h        |  7 ++-
 src/include/utils/tuplesort.h          |  6 +--
 11 files changed, 121 insertions(+), 19 deletions(-)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index d881525..d26c60b 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -109,14 +109,15 @@ btbuild(PG_FUNCTION_ARGS)
 		elog(ERROR, "index \"%s\" already contains data",
 			 RelationGetRelationName(index));
 
-	buildstate.spool = _bt_spoolinit(heap, index, indexInfo->ii_Unique, false);
+	buildstate.spool = _bt_spoolinit(heap, index, indexInfo->ii_Unique,
+									 indexInfo->ii_Predicate != NIL, false);
 
 	/*
 	 * If building a unique index, put dead tuples in a second spool to keep
 	 * them out of the uniqueness check.
 	 */
 	if (indexInfo->ii_Unique)
-		buildstate.spool2 = _bt_spoolinit(heap, index, false, true);
+		buildstate.spool2 = _bt_spoolinit(heap, index, false, true, true);
 
 	/* do the heap scan */
 	reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 593571b..473ac54 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -73,6 +73,7 @@
 #include "storage/smgr.h"
 #include "tcop/tcopprot.h"
 #include "utils/rel.h"
+#include "utils/selfuncs.h"
 #include "utils/sortsupport.h"
 #include "utils/tuplesort.h"
 
@@ -149,10 +150,13 @@ static void _bt_load(BTWriteState *wstate,
  * create and initialize a spool structure
  */
 BTSpool *
-_bt_spoolinit(Relation heap, Relation index, bool isunique, bool isdead)
+_bt_spoolinit(Relation heap, Relation index, bool isunique, bool ispartial,
+			  bool isdead)
 {
 	BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
 	int			btKbytes;
+	double		estRows;
+	float4		relTuples;
 
 	btspool->heap = heap;
 	btspool->index = index;
@@ -165,10 +169,16 @@ _bt_spoolinit(Relation heap, Relation index, bool isunique, bool isdead)
 	 * unique index actually requires two BTSpool objects.  We expect that the
 	 * second one (for dead tuples) won't get very full, so we give it only
 	 * work_mem.
+	 *
+	 * Certain cases will always have a relTuples of 0, such as reindexing as
+	 * part of a CLUSTER operation, or when reindexing toast tables.  This is
+	 * interpreted as "no estimate available".
 	 */
 	btKbytes = isdead ? work_mem : maintenance_work_mem;
+	relTuples = RelationGetForm(heap)->reltuples;
+	estRows =  relTuples * (isdead || ispartial ?  DEFAULT_INEQ_SEL : 1);
 	btspool->sortstate = tuplesort_begin_index_btree(heap, index, isunique,
-													 btKbytes, false);
+													 btKbytes, estRows, false);
 
 	return btspool;
 }
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index bc5f33f..8e5f536 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -890,7 +890,9 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
 	/* Set up sorting if wanted */
 	if (use_sort)
 		tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
-											maintenance_work_mem, false);
+											maintenance_work_mem,
+											RelationGetForm(OldHeap)->reltuples,
+											false);
 	else
 		tuplesort = NULL;
 
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 89de755..95143c3 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -346,6 +346,7 @@ initialize_aggregates(AggState *aggstate,
 	{
 		AggStatePerAgg peraggstate = &peragg[aggno];
 		AggStatePerGroup pergroupstate = &pergroup[aggno];
+		Agg		   		*node = (Agg *) aggstate->ss.ps.plan;
 
 		/*
 		 * Start a fresh sort operation for each DISTINCT/ORDER BY aggregate.
@@ -381,7 +382,9 @@ initialize_aggregates(AggState *aggstate,
 									 peraggstate->sortOperators,
 									 peraggstate->sortCollations,
 									 peraggstate->sortNullsFirst,
-									 work_mem, false);
+									 work_mem,
+									 node->plan.plan_rows,
+									 false);
 		}
 
 		/*
diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c
index b88571b..31d3ead 100644
--- a/src/backend/executor/nodeSort.c
+++ b/src/backend/executor/nodeSort.c
@@ -89,6 +89,7 @@ ExecSort(SortState *node)
 											  plannode->collations,
 											  plannode->nullsFirst,
 											  work_mem,
+											  plannode->plan.plan_rows,
 											  node->randomAccess);
 		if (node->bounded)
 			tuplesort_set_bound(tuplesortstate, node->bound);
diff --git a/src/backend/utils/adt/orderedsetaggs.c b/src/backend/utils/adt/orderedsetaggs.c
index 9d7c71f..4ecf48e 100644
--- a/src/backend/utils/adt/orderedsetaggs.c
+++ b/src/backend/utils/adt/orderedsetaggs.c
@@ -280,7 +280,7 @@ ordered_set_startup(FunctionCallInfo fcinfo, bool use_tuples)
 												   qstate->sortOperators,
 												   qstate->sortCollations,
 												   qstate->sortNullsFirsts,
-												   work_mem, false);
+												   work_mem, -1, false);
 	else
 		osastate->sortstate = tuplesort_begin_datum(qstate->sortColType,
 													qstate->sortOperator,
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 34f607d..741fb59 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -16,6 +16,7 @@
 
 #include <ctype.h>
 #include <limits.h>
+#include <math.h>
 
 #include "access/hash.h"
 #include "access/tuptoaster.h"
@@ -82,12 +83,18 @@ typedef struct
 #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
 #define PG_RETURN_UNKNOWN_P(x)		PG_RETURN_POINTER(x)
 
+/*
+ * Used for calculating number of sort comparisons
+ */
+#define LOG2(x)  (log(x) / 0.693147180559945)
+
 static void btsortsupport_worker(SortSupport ssup, Oid collid);
 static int bttextfastcmp_c(Datum x, Datum y, SortSupport ssup);
 static int bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup);
 static int bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup);
 static Datum bttext_abbrev_convert(Datum original, SortSupport ssup);
-static bool bttext_abbrev_abort(int memtupcount, SortSupport ssup);
+static bool bttext_abbrev_abort(int memtupcount, double estrows,
+								SortSupport ssup);
 static int32 text_length(Datum str);
 static text *text_catenate(text *t1, text *t2);
 static text *text_substring(Datum str,
@@ -2114,17 +2121,84 @@ retry:
  * should be aborted, based on its projected effectiveness.
  */
 static bool
-bttext_abbrev_abort(int memtupcount, SortSupport ssup)
+bttext_abbrev_abort(int memtupcount, double estrows, SortSupport ssup)
 {
 	TextSortSupport	   *tss = (TextSortSupport *) ssup->ssup_extra;
 	double				abbrev_distinct, key_distinct;
 
 	Assert(ssup->abbreviate);
 
-	/* Have a little patience */
+	/* Have a little patience, even without estrows hint */
 	if (memtupcount < 20)
 		return false;
 
+	if (estrows > 0)
+	{
+		double		normalized_rows_to_process,
+					estimated_cmps;
+
+		normalized_rows_to_process = (estrows - memtupcount) / estrows;
+
+		if (normalized_rows_to_process > 0.95 && memtupcount < 200000)
+		{
+			/*
+			 * Be patient -- don't consider aborting until we've processed an
+			 * estimated 5% of all rows to be sorted, or 200,000 rows,
+			 * whichever is less.
+			 */
+#ifdef DEBUG_ABBREV_KEYS
+			elog(DEBUG_elog_output, "conversion patiently waited after %d tuples of %f",
+				 memtupcount, estrows);
+#endif
+			return false;
+		}
+		else if (normalized_rows_to_process < 0.65)
+		{
+			/*
+			 * Already too invested -- don't abort a marginal case.  Note that
+			 * clients will tend to stop calling here when it is established
+			 * that it is too late to abort anyway.
+			 */
+			return false;
+		}
+
+		/*
+		 * strxfrm() is strongly recommended for large lists of strings.  This
+		 * is because despite the memory overhead often implied by an approach
+		 * using string transformation, the number of comparisons that a
+		 * comparison sort algorithm requires increases at least in proportion
+		 * to O(n log n).  Linearithmic growth will result in a number of
+		 * comparisons that is considerably higher than the number of elements.
+		 * (top-N heapsorts never use the abbreviation optimization, and so are
+		 * not considered here.)
+		 *
+		 * Unicode Technical Standard #10 states "Because binary comparison is
+		 * much faster than string comparison, it is faster to use sort keys
+		 * whenever there will be more than about 10 comparisons per string, if
+		 * the system can afford the storage".  That would amount to
+		 * approximately 1,000 list elements on average.  While our costs are
+		 * clearly different in several ways, this calculus cannot be ignored
+		 * entirely.  Past a certain point, we are probabilistically better off
+		 * holding out for some improvement even if there is an abbreviated key
+		 * cardinality of 1 thus far.  That point is somewhat arbitrarily
+		 * assumed to be 20 comparisons per string (approximately 1 million
+		 * estimated rows).  We may still lose, but not by terribly much, and
+		 * only in cases close to the most pessimal worst case.  Even in that
+		 * very worst case, as this tuple count threshold is crossed the
+		 * regression for internal sorts is at or under 5%.
+		 */
+		estimated_cmps = estrows * LOG2(estrows);
+
+		if (estimated_cmps > estrows * 20)
+		{
+#ifdef DEBUG_ABBREV_KEYS
+			elog(DEBUG_elog_output, "row estimate too high (%f, estimated cmps: %f) to ever abort",
+				 estrows, estimated_cmps);
+#endif
+			return false;
+		}
+	}
+
 	abbrev_distinct = estimateHyperLogLog(&tss->abbr_card);
 	key_distinct = estimateHyperLogLog(&tss->full_card);
 
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 4ccb766..a35cef0 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -356,6 +356,7 @@ struct Tuplesortstate
 	 * effectiveness is tested.
 	 */
 	int64		abbrevNext;		/* Tuple # at which to next check applicability */
+	double		abbrevEstRow;	/* Estimated # rows to be sorted, <= 0 if unknown */
 
 	/*
 	 * These variables are specific to the CLUSTER case; they are set by
@@ -600,7 +601,8 @@ tuplesort_begin_heap(TupleDesc tupDesc,
 					 int nkeys, AttrNumber *attNums,
 					 Oid *sortOperators, Oid *sortCollations,
 					 bool *nullsFirstFlags,
-					 int workMem, bool randomAccess)
+					 int workMem, double estRows,
+					 bool randomAccess)
 {
 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
 	MemoryContext oldcontext;
@@ -632,6 +634,7 @@ tuplesort_begin_heap(TupleDesc tupDesc,
 
 	state->tupDesc = tupDesc;	/* assume we need not copy tupDesc */
 	state->abbrevNext = 10;
+	state->abbrevEstRow = estRows;
 
 	/* Prepare SortSupport data for each column */
 	state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
@@ -670,7 +673,8 @@ tuplesort_begin_heap(TupleDesc tupDesc,
 Tuplesortstate *
 tuplesort_begin_cluster(TupleDesc tupDesc,
 						Relation indexRel,
-						int workMem, bool randomAccess)
+						int workMem,
+						double estRows, bool randomAccess)
 {
 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
 	ScanKey			indexScanKey;
@@ -702,6 +706,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc,
 	state->writetup = writetup_cluster;
 	state->readtup = readtup_cluster;
 	state->abbrevNext = 10;
+	state->abbrevEstRow = estRows;
 
 	state->indexInfo = BuildIndexInfo(indexRel);
 
@@ -763,7 +768,8 @@ Tuplesortstate *
 tuplesort_begin_index_btree(Relation heapRel,
 							Relation indexRel,
 							bool enforceUnique,
-							int workMem, bool randomAccess)
+							int workMem,
+							double estRows, bool randomAccess)
 {
 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
 	ScanKey			indexScanKey;
@@ -793,6 +799,7 @@ tuplesort_begin_index_btree(Relation heapRel,
 	state->writetup = writetup_index;
 	state->readtup = readtup_index;
 	state->abbrevNext = 10;
+	state->abbrevEstRow = estRows;
 
 	state->heapRel = heapRel;
 	state->indexRel = indexRel;
@@ -1475,6 +1482,7 @@ consider_abort_common(Tuplesortstate *state)
 		 * indicate that abbreviation should not proceed.
 		 */
 		if (!state->sortKeys->abbrev_abort(state->memtupcount,
+										   state->abbrevEstRow,
 										   state->sortKeys))
 			return false;
 
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index d3d258b..1143a33 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -711,7 +711,7 @@ extern void BTreeShmemInit(void);
 typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */
 
 extern BTSpool *_bt_spoolinit(Relation heap, Relation index,
-			  bool isunique, bool isdead);
+			  bool isunique, bool ispartial, bool isdead);
 extern void _bt_spooldestroy(BTSpool *btspool);
 extern void _bt_spool(BTSpool *btspool, ItemPointer self,
 		  Datum *values, bool *isnull);
diff --git a/src/include/utils/sortsupport.h b/src/include/utils/sortsupport.h
index 4c99ed6..659233b 100644
--- a/src/include/utils/sortsupport.h
+++ b/src/include/utils/sortsupport.h
@@ -176,9 +176,12 @@ typedef struct SortSupportData
 	 * If there is a lot of duplicate abbreviated keys in practice, it's useful
 	 * to be able to abandon the strategy before paying too high a cost in
 	 * conversion (perhaps certain opclass-specific adaptations are useful
-	 * too).
+	 * too).  estrows is typically a planner-provided estimate of the total
+	 * number of rows that will be sorted.  (By convention, estrows <= 0 means
+	 * "no hint available".)
 	 */
-	bool			(*abbrev_abort) (int memtupcount, SortSupport ssup);
+	bool			(*abbrev_abort) (int memtupcount, double estrows,
+									 SortSupport ssup);
 
 	/*
 	 * Full, authoritative comparator for key that an abbreviated
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index 2537883..06e35d5 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -62,14 +62,14 @@ extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
 					 int nkeys, AttrNumber *attNums,
 					 Oid *sortOperators, Oid *sortCollations,
 					 bool *nullsFirstFlags,
-					 int workMem, bool randomAccess);
+					 int workMem, double estRows, bool randomAccess);
 extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc,
 						Relation indexRel,
-						int workMem, bool randomAccess);
+						int workMem, double estRows, bool randomAccess);
 extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel,
 							Relation indexRel,
 							bool enforceUnique,
-							int workMem, bool randomAccess);
+							int workMem, double estRows, bool randomAccess);
 extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel,
 						   Relation indexRel,
 						   uint32 hash_mask,
-- 
1.9.1

From d87da9491832946050aad8ec6ced48905a0a858b Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <p...@heroku.com>
Date: Sat, 2 Aug 2014 15:39:28 -0700
Subject: [PATCH 1/2] Abbreviated sortsupport keys

Add new abbreviation infrastructure to sortsupport, and add a single
client of this new infrastructure, the text sortsupport routine.

Abbreviation gives opclasses the option to provide abbreviated
representations of Datums in advance of sorting, generated through an
opclass-specific encoding scheme used as tuples are copied into the
memtuples array by tuplesort.  An abbreviated representation is
typically pass by value, and the capability is expected to be most
useful for accelerating sorts of pass by reference types, but there are
no hard requirements on representation.
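
Concretely, the sortsupport struct gains an "applicable in principle"
flag plus a small family of optional callbacks alongside the existing
comparator (field names as in the sortsupport.h changes below;
declarations condensed here):

    bool    abbreviate;         /* does the caller want abbreviation? */
    Datum   (*abbrev_converter) (Datum original, SortSupport ssup);
    bool    (*abbrev_abort) (int memtupcount, SortSupport ssup);
    int     (*abbrev_full_comparator) (Datum x, Datum y, SortSupport ssup);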

Opclasses that support this new capability provide a special
abbreviation-aware comparator (in addition to the usual, ordinary
sortsupport fmgr-eliding comparator).  This comparator returns a value
less than, equal to, or greater than 0, in a manner similar to the usual
fmgr-eliding comparator (or a manner similar to any btree support
function 1).  However, there is an important difference: sortsupport
clients such as tuplesort interpret the value 0 as "inconclusive".
Inconclusive comparisons obligate these clients to perform an ordinary,
tie-breaker comparison on the same attribute.
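
For example, the tuplesort comparators resolve an inconclusive result
along these lines (a sketch based on the comparetup_heap() changes
below; ApplySortAbbrevFullComparator is the sortsupport.h helper those
changes use):

    /* datum1 caches the abbreviated key whenever abbreviation is in use */
    compare = ApplySortComparator(a->datum1, a->isnull1,
                                  b->datum1, b->isnull1, sortKey);
    if (compare != 0)
        return compare;

    /* 0 is merely "inconclusive"; tie-break using the full values */
    if (sortKey->abbrev_converter)
    {
        datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
        datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);

        compare = ApplySortAbbrevFullComparator(datum1, isnull1,
                                                datum2, isnull2,
                                                sortKey);
        if (compare != 0)
            return compare;
    }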

By generating abbreviated keys, both internal and external sorts can be
sped up considerably.  Because of the high cost of CPU cache stalls when
sorting, cache efficiency is a crucial consideration in engineering
sorting algorithms and related infrastructure.  Pass by value
representations have much greater temporal and spatial locality than
pass by reference representations.  Furthermore, the abbreviation-aware
comparisons can themselves be far cheaper than an authoritative
comparison, if the abbreviation encoding scheme can produce a
"reductionist" representation sufficient to resolve most comparisons.
Encoding may be expensive, but if encoding need only occur N times,
while comparisons of roughly comparable cost would otherwise occur
N log N times, it can be a highly effective optimization, particularly
in the average and worst case.
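
For a sense of scale (rough arithmetic only): sorting n = 1,000,000
strings implies on the order of n * log2(n), i.e. about 20 million
comparisons, whereas encoding happens just 1,000,000 times, so each
strxfrm() call is amortized across roughly 20 comparisons that may then
be resolved by cheap memcmp()-style operations.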

The expense of encoding may well be problematic if no benefit can be
derived from abbreviated keys during sorting, though.  Opclasses
supporting abbreviated keys also provide an "abort" callback.  This can
either abort the abbreviation encoding process entirely, or give the
encoding scheme the ability to adapt its encoding strategy in order to
maximize the number of comparisons that are resolved with abbreviated
keys.  HyperLogLog, a streaming cardinality estimation algorithm, is
added to give clients of the new sortsupport infrastructure a principled
way to estimate the usefulness of encoding.
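
The estimator's interface is small.  A sketch of how an opclass tracks
abbreviated key cardinality with it (this mirrors the text opclass code
in the patch; only initHyperLogLog(), addHyperLogLog() and
estimateHyperLogLog() are assumed):

    hyperLogLogState    abbr_card;
    double              abbrev_distinct;

    initHyperLogLog(&abbr_card, 10);    /* 2^10 = 1024 registers */

    /* for each abbreviated key produced during conversion ... */
    addHyperLogLog(&abbr_card, DatumGetUInt32(hash_uint32((uint32) res)));

    /* ... and later, from the opclass "abort" callback */
    abbrev_distinct = estimateHyperLogLog(&abbr_card);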

Not all tuplesort clients use the optimization.  In two instances where
it might be considered applicable in principle but is nonetheless not
used (nodeAgg.c and orderedsetaggs.c), this is noted as an open item
(presumably this will be addressed in the future).  In all other cases,
we deliberately limit abbreviation because it isn't likely to pay off or
is otherwise legitimately not applicable.  For example, this is the case
for top-N heapsorts.

Support for abbreviation within text sortsupport works by copying the
first sizeof(Datum) bytes of a strxfrm()-returned blob to a Datum.
Abbreviation-aware comparisons are simple memcmp() comparisons of this
representation.  As is always the case with text, tie-breakers
optimistically attempt to get away with a simple memcmp() of the
original text string before an expensive strcoll() is performed, so low
cardinality attributes can still benefit from this optimization to a
considerable degree (in particular, the text ad-hoc cost model ought not
to discriminate against them needlessly).
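
Condensed, the conversion and comparison steps look roughly like this (a
sketch of bttext_abbrev_convert()/bttextcmp_abbrev() from the patch;
buffer growth, locale handling and the HyperLogLog bookkeeping are
omitted):

    Datum   res = 0;
    Size    bsize;
    int     result;

    /* conversion: buf1 holds the NUL-terminated text, buf2 the blob */
    bsize = strxfrm(tss->buf2, tss->buf1, tss->buflen2);
    memcpy((char *) &res, tss->buf2, Min(sizeof(Datum), bsize));

    /* comparison: a straight memcmp() of the two abbreviated Datums
     * (x and y being the Datum arguments passed to the comparator) */
    result = memcmp((char *) &x, (char *) &y, sizeof(Datum));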
---
 src/backend/access/nbtree/nbtsort.c    |   2 +
 src/backend/commands/analyze.c         |   6 +
 src/backend/executor/nodeAgg.c         |   4 +
 src/backend/executor/nodeMergeAppend.c |   9 +
 src/backend/executor/nodeMergejoin.c   |   8 +
 src/backend/lib/Makefile               |   2 +-
 src/backend/lib/hyperloglog.c          | 228 ++++++++++++++++++
 src/backend/utils/adt/orderedsetaggs.c |   8 +-
 src/backend/utils/adt/varlena.c        | 330 ++++++++++++++++++++++++--
 src/backend/utils/sort/tuplesort.c     | 413 +++++++++++++++++++++++++++++----
 src/include/lib/hyperloglog.h          |  67 ++++++
 src/include/pg_config_manual.h         |  14 +-
 src/include/utils/sortsupport.h        | 134 ++++++++++-
 13 files changed, 1149 insertions(+), 76 deletions(-)
 create mode 100644 src/backend/lib/hyperloglog.c
 create mode 100644 src/include/lib/hyperloglog.h

diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 7616882..593571b 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -717,6 +717,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 			sortKey->ssup_nulls_first =
 				(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
 			sortKey->ssup_attno = scanKey->sk_attno;
+			/* Abbreviation is not supported here */
+			sortKey->abbreviate = false;
 
 			AssertState(sortKey->ssup_attno != 0);
 
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 732ab22..91c0ca3 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -2301,6 +2301,12 @@ compute_scalar_stats(VacAttrStatsP stats,
 	/* We always use the default collation for statistics */
 	ssup.ssup_collation = DEFAULT_COLLATION_OID;
 	ssup.ssup_nulls_first = false;
+	/*
+	 * For now, don't perform abbreviated key conversion, because full values
+	 * are required for MCV slot generation.  Supporting that optimization
+	 * would necessitate teaching compare_scalars() to call a tie-breaker.
+	 */
+	ssup.abbreviate = false;
 
 	PrepareSortSupportFromOrderingOp(mystats->ltopr, &ssup);
 
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 510d1c5..89de755 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -363,6 +363,10 @@ initialize_aggregates(AggState *aggstate,
 			 * We use a plain Datum sorter when there's a single input column;
 			 * otherwise sort the full tuple.  (See comments for
 			 * process_ordered_aggregate_single.)
+			 *
+			 * In the future, we should consider forcing the
+			 * tuplesort_begin_heap() case when the abbreviated key
+			 * optimization can thereby be used, even when numInputs is 1.
 			 */
 			peraggstate->sortstate =
 				(peraggstate->numInputs == 1) ?
diff --git a/src/backend/executor/nodeMergeAppend.c b/src/backend/executor/nodeMergeAppend.c
index 47ed068..bd30c57 100644
--- a/src/backend/executor/nodeMergeAppend.c
+++ b/src/backend/executor/nodeMergeAppend.c
@@ -137,6 +137,15 @@ ExecInitMergeAppend(MergeAppend *node, EState *estate, int eflags)
 		sortKey->ssup_nulls_first = node->nullsFirst[i];
 		sortKey->ssup_attno = node->sortColIdx[i];
 
+		/*
+		 * It isn't feasible to perform abbreviated key conversion, since
+		 * tuples are pulled into mergestate's binary heap as needed.  It would
+		 * likely be counter-productive to convert tuples into an abbreviated
+		 * representation as they're pulled up, so opt out of that additional
+		 * optimization entirely.
+		 */
+		sortKey->abbreviate = false;
+
 		PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey);
 	}
 
diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c
index fdf2f4c..1f355f9 100644
--- a/src/backend/executor/nodeMergejoin.c
+++ b/src/backend/executor/nodeMergejoin.c
@@ -229,6 +229,14 @@ MJExamineQuals(List *mergeclauses,
 			elog(ERROR, "cannot merge using non-equality operator %u",
 				 qual->opno);
 
+		/*
+		 * sortsupport routine must know if abbreviation optimization is
+		 * applicable in principle.  It is never applicable for merge joins
+		 * because there is no convenient opportunity to convert to alternative
+		 * representation.  Only elide fmgr trampoline where supported.
+		 */
+		clause->ssup.abbreviate = false;
+
 		/* And get the matching support or comparison function */
 		Assert(clause->ssup.comparator == NULL);
 		sortfunc = get_opfamily_proc(opfamily,
diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile
index 327a1bc..14f9a46 100644
--- a/src/backend/lib/Makefile
+++ b/src/backend/lib/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/lib
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = ilist.o binaryheap.o stringinfo.o
+OBJS = ilist.o binaryheap.o hyperloglog.o stringinfo.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/lib/hyperloglog.c b/src/backend/lib/hyperloglog.c
new file mode 100644
index 0000000..1157e9a
--- /dev/null
+++ b/src/backend/lib/hyperloglog.c
@@ -0,0 +1,228 @@
+/*-------------------------------------------------------------------------
+ *
+ * hyperloglog.c
+ *	  HyperLogLog cardinality estimator
+ *
+ * Portions Copyright (c) 2014, PostgreSQL Global Development Group
+ *
+ * Based on Hideaki Ohno's C++ implementation.  This is probably not ideally
+ * suited to estimating the cardinality of very large sets;  in particular, we
+ * have not attempted to further optimize the implementation as described in
+ * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
+ * Engineering of a State of The Art Cardinality Estimation Algorithm".
+ *
+ * A sparse representation of HyperLogLog state is used, with fixed space
+ * overhead.
+ *
+ * The copyright terms of Ohno's original version (the MIT license) follow.
+ *
+ * IDENTIFICATION
+ *	  src/backend/lib/hyperloglog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the 'Software'), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+
+#include "lib/hyperloglog.h"
+
+#define POW_2_32			(4294967296.0)
+#define NEG_POW_2_32		(-4294967296.0)
+
+static inline uint8 rho(uint32 x, uint8 b);
+
+/*
+ * Initialize HyperLogLog track state
+ *
+ * bwidth is bit width (so register size will be 2 to the power of bwidth).
+ * Must be between 4 and 16 inclusive.
+ */
+void
+initHyperLogLog(hyperLogLogState *cState, uint8 bwidth)
+{
+	double		alpha;
+
+	if (bwidth < 4 || bwidth > 16)
+		elog(ERROR, "bit width must be between 4 and 16 inclusive");
+
+	cState->registerWidth = bwidth;
+	cState->nRegisters = 1 << bwidth;
+	cState->arrSize = sizeof(uint8) * cState->nRegisters + 1;
+
+	/*
+	 * Initialize hashes array to zero, not negative infinity, per discussion
+	 * of the coupon collector problem in the HyperLogLog paper
+	 */
+	cState->hashesArr = palloc0(cState->arrSize);
+
+	/*
+	 * "alpha" is a value that for each possible number of registers (m) is
+	 * used to correct a systematic multiplicative bias present in m ^ 2 Z (Z
+	 * is "the indicator function" through which we finally compute E,
+	 * estimated cardinality).
+	 */
+	switch (cState->nRegisters)
+	{
+		case 16:
+			alpha = 0.673;
+			break;
+		case 32:
+			alpha = 0.697;
+			break;
+		case 64:
+			alpha = 0.709;
+			break;
+		default:
+			alpha = 0.7213 / (1.0 + 1.079 / cState->nRegisters);
+	}
+
+	/*
+	 * Precalculate alpha m ^ 2, later used to generate "raw" HyperLogLog
+	 * estimate E
+	 */
+	cState->alphaMM = alpha * cState->nRegisters * cState->nRegisters;
+}
+
+/*
+ * Adds element to the estimator, from caller-supplied hash.
+ *
+ * It is critical that the hash value passed be an actual hash value, typically
+ * generated using hash_any().  The algorithm relies on a specific bit-pattern
+ * observable in conjunction with stochastic averaging.  There must be a
+ * uniform distribution of bits in hash values for each distinct original value
+ * observed.
+ */
+void
+addHyperLogLog(hyperLogLogState *cState, uint32 hash)
+{
+	uint8		count;
+	uint32		index;
+
+	/* Use the first "k" (registerWidth) bits as a zero based index */
+	index = hash >> (BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
+
+	/* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
+	count = rho(hash << cState->registerWidth,
+				BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
+
+	cState->hashesArr[index] = Max(count, cState->hashesArr[index]);
+}
+
+/*
+ * Estimates cardinality, based on elements added so far
+ */
+double
+estimateHyperLogLog(hyperLogLogState *cState)
+{
+	double		result;
+	double		sum = 0.0;
+	int			i;
+
+	for (i = 0; i < cState->nRegisters; i++)
+	{
+		sum += 1.0 / pow(2.0, cState->hashesArr[i]);
+	}
+
+	/* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
+	result = cState->alphaMM / sum;
+
+	if (result <= (5.0 / 2.0) * cState->nRegisters)
+	{
+		/* Small range correction */
+		int 	zero_count = 0;
+
+		for (i = 0; i < cState->nRegisters; i++)
+		{
+			if (cState->hashesArr[i] == 0)
+				zero_count++;
+		}
+
+		if (zero_count != 0)
+			result = cState->nRegisters * log((double) cState->nRegisters /
+											  zero_count);
+	}
+	else if (result > (1.0 / 30.0) * POW_2_32)
+	{
+		/* Large range correction */
+		result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32));
+	}
+
+	return result;
+}
+
+/*
+ * Merges the estimate from one HyperLogLog state to another, returning the
+ * estimate of their union.
+ *
+ * The number of registers in each must match.
+ */
+void
+mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState)
+{
+	int		r;
+
+	if (cState->nRegisters != oState->nRegisters)
+		elog(ERROR, "number of registers mismatch: %zu != %zu",
+			 cState->nRegisters, oState->nRegisters);
+
+	for (r = 0; r < cState->nRegisters; ++r)
+	{
+		cState->hashesArr[r] = Max(cState->hashesArr[r], oState->hashesArr[r]);
+	}
+}
+
+
+/*
+ * Worker for addHyperLogLog().
+ *
+ * Calculates the position of the first set bit within the first b bits of
+ * the x argument, reading from the most significant bit to the least
+ * significant bit.
+ *
+ * Example (when considering the first 10 bits of x):
+ *
+ * rho(x = 0b1000000000)   returns 1
+ * rho(x = 0b0010000000)   returns 3
+ * rho(x = 0b0000000000)   returns b + 1
+ *
+ * "The binary address determined by the first b bits of x"
+ *
+ * Return value "j" used to index bit pattern to watch.
+ */
+static inline uint8
+rho(uint32 x, uint8 b)
+{
+	uint8	j = 1;
+
+	while (j <= b && !(x & 0x80000000))
+	{
+		j++;
+		x <<= 1;
+	}
+
+	return j;
+}
diff --git a/src/backend/utils/adt/orderedsetaggs.c b/src/backend/utils/adt/orderedsetaggs.c
index 135a14b..9d7c71f 100644
--- a/src/backend/utils/adt/orderedsetaggs.c
+++ b/src/backend/utils/adt/orderedsetaggs.c
@@ -266,7 +266,13 @@ ordered_set_startup(FunctionCallInfo fcinfo, bool use_tuples)
 	osastate->qstate = qstate;
 	osastate->gcontext = gcontext;
 
-	/* Initialize tuplesort object */
+	/*
+	 * Initialize tuplesort object.
+	 *
+	 * In the future, we should consider forcing the tuplesort_begin_heap()
+	 * case when the abbreviated key optimization can thereby be used, even
+	 * when !use_tuples.
+	 */
 	if (use_tuples)
 		osastate->sortstate = tuplesort_begin_heap(qstate->tupdesc,
 												   qstate->numSortCols,
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index b3f397e..34f607d 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -17,9 +17,11 @@
 #include <ctype.h>
 #include <limits.h>
 
+#include "access/hash.h"
 #include "access/tuptoaster.h"
 #include "catalog/pg_collation.h"
 #include "catalog/pg_type.h"
+#include "lib/hyperloglog.h"
 #include "libpq/md5.h"
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
@@ -32,6 +34,9 @@
 #include "utils/pg_locale.h"
 #include "utils/sortsupport.h"
 
+#ifdef DEBUG_ABBREV_KEYS
+#define DEBUG_elog_output	DEBUG1
+#endif
 
 /* GUC variable */
 int			bytea_output = BYTEA_OUTPUT_HEX;
@@ -54,10 +59,12 @@ typedef struct
 
 typedef struct
 {
-	char			   *buf1;		/* 1st string */
-	char			   *buf2;		/* 2nd string */
+	char			   *buf1;		/* 1st string, or abbreviation original string buf */
+	char			   *buf2;		/* 2nd string, or abbreviation strxfrm() buf */
 	int					buflen1;
 	int					buflen2;
+	hyperLogLogState	abbr_card;	/* Abbreviated key cardinality state */
+	hyperLogLogState	full_card;	/* Full key cardinality state */
 #ifdef HAVE_LOCALE_T
 	pg_locale_t locale;
 #endif
@@ -78,6 +85,9 @@ typedef struct
 static void btsortsupport_worker(SortSupport ssup, Oid collid);
 static int bttextfastcmp_c(Datum x, Datum y, SortSupport ssup);
 static int bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup);
+static int bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup);
+static Datum bttext_abbrev_convert(Datum original, SortSupport ssup);
+static bool bttext_abbrev_abort(int memtupcount, SortSupport ssup);
 static int32 text_length(Datum str);
 static text *text_catenate(text *t1, text *t2);
 static text *text_substring(Datum str,
@@ -1736,26 +1746,53 @@ btsortsupport_worker(SortSupport ssup, Oid collid)
 	TextSortSupport	   *tss;
 
 	/*
-	 * If LC_COLLATE = C, we can make things quite a bit faster by using
-	 * memcmp() rather than strcoll().  To minimize the per-comparison
-	 * overhead, we make this decision just once for the whole sort.
-	 */
-	if (lc_collate_is_c(collid))
-	{
-		ssup->comparator = bttextfastcmp_c;
-		return;
-	}
-
-	/*
 	 * WIN32 requires complex hacks when the database encoding is UTF-8 (except
 	 * when using the "C" collation).  For now, we don't optimize that case.
 	 */
 #ifdef WIN32
-	if (GetDatabaseEncoding() == PG_UTF8)
+	if (GetDatabaseEncoding() == PG_UTF8 && !lc_collate_is_c(collid))
 		return;
 #endif
 
 	/*
+	 * On platforms where the abbreviated key optimization for text might have
+	 * bad worst case performance, it may be useful to avoid it entirely by
+	 * disabling it at compile time.  Having only 4 byte datums could make
+	 * worst-case performance drastically more likely, for example.  Moreover,
+	 * Darwin's strxfrm() implementation is known not to effectively
+	 * concentrate a significant amount of entropy from the original string in
+	 * earlier transformed blobs.  It's possible that other supported platforms
+	 * are similarly encumbered.
+	 *
+	 * Any reasonable implementation will pack primary weights into the start
+	 * of returned blobs.  The canonical algorithm's implementation is
+	 * discussed by Unicode Technical Standard #10 ("UNICODE COLLATION
+	 * ALGORITHM"), section 4, "Main algorithm".  Section 4.3, "Form Sort Key"
+	 * is of particular interest:
+	 *
+	 * http://www.unicode.org/reports/tr10/#Step_3
+	 *
+	 * The collation algorithm standard goes on to state:
+	 *
+	 * "By default, the algorithm makes use of three fully-customizable levels.
+	 * For the Latin script, these levels correspond roughly to:
+	 *
+	 * alphabetic ordering
+	 *
+	 * diacritic ordering
+	 *
+	 * case ordering.
+	 *
+	 * A final level may be used for tie-breaking between strings not otherwise
+	 * distinguished."
+	 *
+	 * It is generally expected that most non-equal keys will have their
+	 * comparisons resolved at the primary level.  If enough comparisons can be
+	 * resolved with just 4 or 8 byte abbreviated keys, this optimization is
+	 * very effective (although if there are many tie-breakers that largely
+	 * only perform cheap memcmp() calls, that is also much faster than the
+	 * unoptimized case - see bttext_abbrev_abort()).
+	 *
 	 * We may need a collation-sensitive comparison.  To make things faster,
 	 * we'll figure out the collation based on the locale id and cache the
 	 * result.  Also, since strxfrm()/strcoll() require NUL-terminated inputs,
@@ -1788,13 +1825,47 @@ btsortsupport_worker(SortSupport ssup, Oid collid)
 #endif
 	}
 
-	tss->buf1 = palloc(TEXTBUFLEN);
-	tss->buflen1 = TEXTBUFLEN;
-	tss->buf2 = palloc(TEXTBUFLEN);
-	tss->buflen2 = TEXTBUFLEN;
+	/*
+	 * If LC_COLLATE = C, we can make things quite a bit faster by using
+	 * memcmp() rather than strcoll().  To minimize the per-comparison
+	 * overhead, we make this decision just once for the whole sort.
+	 *
+	 * There is no reason not to at least perform fmgr elision on builds where
+	 * abbreviation is disabled.
+	 */
+	if (lc_collate_is_c(collid))
+		ssup->abbrev_full_comparator = ssup->comparator = bttextfastcmp_c;
+	else
+		ssup->abbrev_full_comparator = ssup->comparator = bttextfastcmp_locale;
+
+	if (!lc_collate_is_c(collid) || ssup->abbreviate)
+	{
+		/*
+		 * Abbreviated case requires temp buffers for strxfrm() copying.
+		 * bttextfastcmp_locale() also uses these buffers (even if abbreviation
+		 * isn't used), while bttextfastcmp_c() does not.
+		 */
+		tss->buf1 = palloc(TEXTBUFLEN);
+		tss->buflen1 = TEXTBUFLEN;
+		tss->buf2 = palloc(TEXTBUFLEN);
+		tss->buflen2 = TEXTBUFLEN;
+		ssup->ssup_extra = tss;
+	}
+
+	if (!ssup->abbreviate)
+		return;
 
-	ssup->ssup_extra = tss;
-	ssup->comparator = bttextfastcmp_locale;
+	initHyperLogLog(&tss->abbr_card, 10);
+	initHyperLogLog(&tss->full_card, 10);
+
+	/*
+	 * Change comparator to be abbreviation-based -- abbreviated version will
+	 * probably ultimately be used during sorting proper, but core code may
+	 * switch back to authoritative comparator should abbreviation be aborted
+	 */
+	ssup->comparator = bttextcmp_abbrev;
+	ssup->abbrev_converter = bttext_abbrev_convert;
+	ssup->abbrev_abort = bttext_abbrev_abort;
 }
 
 /*
@@ -1903,6 +1974,225 @@ done:
 	return result;
 }
 
+/*
+ * Abbreviated key comparison func
+ */
+static int
+bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup)
+{
+	char   *a = (char *) &x;
+	char   *b = (char *) &y;
+	int 	result;
+
+	result = memcmp(a, b, sizeof(Datum));
+
+	/*
+	 * When result = 0, the core system will call bttextfastcmp_c() or
+	 * bttextfastcmp_locale().  Even a strcmp() on two non-truncated strxfrm()
+	 * blobs cannot indicate *equality* authoritatively, for the same reason
+	 * that there is a strcoll() tie-breaker call to strcmp() in varstr_cmp().
+	 */
+	return result;
+}
+
+/*
+ * Conversion routine for sortsupport.  Converts original text to abbreviated
+ * key representation.  Our encoding strategy is simple -- pack the first
+ * sizeof(Datum) bytes of a strxfrm() blob into a Datum.
+ */
+static Datum
+bttext_abbrev_convert(Datum original, SortSupport ssup)
+{
+	TextSortSupport	   *tss = (TextSortSupport *) ssup->ssup_extra;
+	text			   *authoritative = DatumGetTextPP(original);
+
+	/* working state */
+	Datum				res;
+	char			   *pres;
+	int					len;
+	Size				bsize;
+	uint32				hash;
+
+	/*
+	 * Abbreviated key representation is a pass-by-value Datum that is treated
+	 * as a char array by the specialized comparator bttextcmp_abbrev().
+	 */
+	pres = (char *) &res;
+	/* memset(), so any non-overwritten bytes are NUL */
+	memset(pres, 0, sizeof(Datum));
+	len = VARSIZE_ANY_EXHDR(authoritative);
+
+	/* By convention, we use buffer 1 to store and NUL-terminate text */
+	if (len >= tss->buflen1)
+	{
+		pfree(tss->buf1);
+		tss->buflen1 = Max(len + 1, Min(tss->buflen1 * 2, MaxAllocSize));
+		tss->buf1 = palloc(tss->buflen1);
+	}
+
+	/* Just like strcoll(), strxfrm() expects a NUL-terminated string */
+	memcpy(tss->buf1, VARDATA_ANY(authoritative), len);
+	tss->buf1[len] = '\0';
+
+	/* Don't leak memory here */
+	if (PointerGetDatum(authoritative) != original)
+		pfree(authoritative);
+
+retry:
+
+	/*
+	 * There is no special handling of the C locale here, unlike with
+	 * varstr_cmp().  strxfrm() is used indifferently.
+	 */
+#ifdef HAVE_LOCALE_T
+	if (tss->locale)
+		bsize = strxfrm_l(tss->buf2, tss->buf1, tss->buflen2, tss->locale);
+	else
+#endif
+		bsize = strxfrm(tss->buf2, tss->buf1, tss->buflen2);
+
+	if (bsize >= tss->buflen2)
+	{
+		/*
+		 * The C standard states that the contents of the buffer are now
+		 * unspecified.  Grow the buffer and retry.
+		 */
+		pfree(tss->buf2);
+		tss->buflen2 = Max(bsize + 1, Min(tss->buflen2 * 2, MaxAllocSize));
+		tss->buf2 = palloc(tss->buflen2);
+		goto retry;
+	}
+
+	/*
+	 * Maintain approximate cardinality of both abbreviated keys and original,
+	 * authoritative keys using HyperLogLog.  Used as cheap insurance against
+	 * the worst case, where we do many string transformations for no saving in
+	 * full strcoll()-based comparisons.  These statistics are used by
+	 * bttext_abbrev_abort().
+	 *
+	 * First, hash the key proper, or a significant fraction of it.  Mix in
+	 * the length to compensate for cases where differences lie past the first
+	 * PG_CACHE_LINE_SIZE bytes, which are not hashed so as to limit overhead.
+	 */
+	hash = hash_any((unsigned char *) tss->buf1, Min(len, PG_CACHE_LINE_SIZE));
+
+	if (len > PG_CACHE_LINE_SIZE)
+		hash ^= DatumGetUInt32(hash_uint32((uint32) len));
+
+	addHyperLogLog(&tss->full_card, hash);
+
+	memcpy(pres, tss->buf2, Min(sizeof(Datum), bsize));
+
+	/* Hash abbreviated key */
+#if SIZEOF_DATUM == 8
+	{
+		uint32				lohalf,
+							hihalf;
+
+		lohalf = (uint32) res;
+		hihalf = (uint32) (res >> 32);
+		hash = hash_uint32(lohalf ^ hihalf);
+	}
+#else							/* SIZEOF_DATUM != 8 */
+	hash = hash_uint32((uint32) res);
+#endif
+
+	addHyperLogLog(&tss->abbr_card, hash);
+
+	/*
+	 * Every Datum byte is always compared.  This is safe because the strxfrm()
+	 * blob is itself NUL terminated, leaving no danger of misinterpreting any
+	 * NUL bytes not intended to be interpreted as logically representing
+	 * termination.
+	 */
+	return res;
+}
+
+/*
+ * Callback for estimating effectiveness of abbreviated key optimization, using
+ * heuristic rules.  Returns value indicating if the abbreviation optimization
+ * should be aborted, based on its projected effectiveness.
+ */
+static bool
+bttext_abbrev_abort(int memtupcount, SortSupport ssup)
+{
+	TextSortSupport	   *tss = (TextSortSupport *) ssup->ssup_extra;
+	double				abbrev_distinct, key_distinct;
+
+	Assert(ssup->abbreviate);
+
+	/* Have a little patience */
+	if (memtupcount < 20)
+		return false;
+
+	abbrev_distinct = estimateHyperLogLog(&tss->abbr_card);
+	key_distinct = estimateHyperLogLog(&tss->full_card);
+
+	/*
+	 * Clamp cardinality estimates to at least one distinct value.  While NULLs
+	 * are generally disregarded, if only NULL values were seen so far, that
+	 * might misrepresent costs if we failed to clamp.
+	 */
+	if (abbrev_distinct <= 1.0)
+		abbrev_distinct = 1.0;
+
+	if (key_distinct <= 1.0)
+		key_distinct = 1.0;
+
+	/*
+	 * In the worst case all abbreviated keys are identical, while at the same
+	 * time there are differences within full key strings not captured in
+	 * abbreviations.
+	 */
+#ifdef DEBUG_ABBREV_KEYS
+	{
+		double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
+
+		elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f)",
+			 memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card);
+	}
+#endif
+
+	/*
+	 * If the number of distinct abbreviated keys approximately matches the
+	 * number of distinct authoritative original keys, that's reason enough to
+	 * proceed.  We can win even with a very low cardinality set if most
+	 * tie-breakers only memcmp().  This is by far the most important
+	 * consideration.
+	 *
+	 * While comparisons that are resolved at the abbreviated key level are
+	 * considerably cheaper than tie-breakers resolved with memcmp(), both of
+	 * those two outcomes are so much cheaper than a full strcoll() once
+	 * sorting is underway that it doesn't seem worth it to weigh abbreviated
+	 * cardinality against the overall size of the set in order to more
+	 * accurately model costs.  Assume that an abbreviated comparison, and an
+	 * abbreviated comparison with a cheap memcmp()-based authoritative
+	 * resolution are equivalent.
+	 */
+	if (abbrev_distinct > key_distinct * 0.05)
+		return false;
+
+	/*
+	 * Abort abbreviation strategy.
+	 *
+	 * The worst case, where all abbreviated keys are identical while all
+	 * original strings differ, will typically only see a regression of about
+	 * 10% in execution time for small to medium sized lists of strings.
+	 * Whereas on modern CPUs where cache stalls are the dominant cost, we can
+	 * often expect very large improvements, particularly with sets of strings
+	 * of moderately high to high abbreviated cardinality.  There is little to
+	 * lose but much to gain, which our strategy reflects.
+	 */
+#ifdef DEBUG_ABBREV_KEYS
+	elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. abbrev_distinct: %f, key_distinct: %f",
+		 memtupcount, abbrev_distinct, key_distinct);
+	/* Actually abort only when debugging is disabled */
+	return false;
+#endif
+
+	return true;
+}
+
 Datum
 text_larger(PG_FUNCTION_ARGS)
 {
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index b68a85d..4ccb766 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -150,7 +150,10 @@ bool		optimize_bounded_sort = true;
  * When sorting single Datums, the data value is represented directly by
  * datum1/isnull1.  If the datatype is pass-by-reference and isnull1 is false,
  * then datum1 points to a separately palloc'd data value that is also pointed
- * to by the "tuple" pointer; otherwise "tuple" is NULL.
+ * to by the "tuple" pointer; otherwise "tuple" is NULL.  There is one special
+ * case:  when the sort support infrastructure provides an "abbreviated key"
+ * representation, where the key is (typically) a pass by value proxy for a
+ * pass by reference type.
  *
  * While building initial runs, tupindex holds the tuple's run number.  During
  * merge passes, we re-use it to hold the input tape number that each tuple in
@@ -347,6 +350,14 @@ struct Tuplesortstate
 	SortSupport onlyKey;
 
 	/*
+	 * Additional state for managing "abbreviated key" sortsupport routines
+	 * (which currently may be used by all cases except the Datum sort case and
+	 * hash index case).  Tracks the intervals at which the optimization's
+	 * effectiveness is tested.
+	 */
+	int64		abbrevNext;		/* Tuple # at which to next check applicability */
+
+	/*
 	 * These variables are specific to the CLUSTER case; they are set by
 	 * tuplesort_begin_cluster.
 	 */
@@ -442,6 +453,7 @@ struct Tuplesortstate
 
 static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess);
 static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
+static bool consider_abort_common(Tuplesortstate *state);
 static void inittapes(Tuplesortstate *state);
 static void selectnewtape(Tuplesortstate *state);
 static void mergeruns(Tuplesortstate *state);
@@ -619,6 +631,7 @@ tuplesort_begin_heap(TupleDesc tupDesc,
 	state->readtup = readtup_heap;
 
 	state->tupDesc = tupDesc;	/* assume we need not copy tupDesc */
+	state->abbrevNext = 10;
 
 	/* Prepare SortSupport data for each column */
 	state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
@@ -634,11 +647,19 @@ tuplesort_begin_heap(TupleDesc tupDesc,
 		sortKey->ssup_collation = sortCollations[i];
 		sortKey->ssup_nulls_first = nullsFirstFlags[i];
 		sortKey->ssup_attno = attNums[i];
+		/* Convey if abbreviation optimization is applicable in principle */
+		sortKey->abbreviate = (i == 0);
 
 		PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
 	}
 
-	if (nkeys == 1)
+	/*
+	 * The "onlyKey" optimization cannot be used with abbreviated keys, since
+	 * tie-breaker comparisons may be required.  Typically, the optimization is
+	 * only of value to pass-by-value types anyway, whereas abbreviated keys
+	 * are typically only of value to pass-by-reference types.
+	 */
+	if (nkeys == 1 && !state->sortKeys->abbrev_converter)
 		state->onlyKey = state->sortKeys;
 
 	MemoryContextSwitchTo(oldcontext);
@@ -680,6 +701,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc,
 	state->copytup = copytup_cluster;
 	state->writetup = writetup_cluster;
 	state->readtup = readtup_cluster;
+	state->abbrevNext = 10;
 
 	state->indexInfo = BuildIndexInfo(indexRel);
 
@@ -719,6 +741,8 @@ tuplesort_begin_cluster(TupleDesc tupDesc,
 		sortKey->ssup_nulls_first =
 			(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
 		sortKey->ssup_attno = scanKey->sk_attno;
+		/* Convey if abbreviation optimization is applicable in principle */
+		sortKey->abbreviate = (i == 0);
 
 		AssertState(sortKey->ssup_attno != 0);
 
@@ -768,6 +792,7 @@ tuplesort_begin_index_btree(Relation heapRel,
 	state->copytup = copytup_index;
 	state->writetup = writetup_index;
 	state->readtup = readtup_index;
+	state->abbrevNext = 10;
 
 	state->heapRel = heapRel;
 	state->indexRel = indexRel;
@@ -791,6 +816,8 @@ tuplesort_begin_index_btree(Relation heapRel,
 		sortKey->ssup_nulls_first =
 			(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
 		sortKey->ssup_attno = scanKey->sk_attno;
+		/* Convey if abbreviation optimization is applicable in principle */
+		sortKey->abbreviate = (i == 0);
 
 		AssertState(sortKey->ssup_attno != 0);
 
@@ -883,6 +910,13 @@ tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
 	state->onlyKey->ssup_cxt = CurrentMemoryContext;
 	state->onlyKey->ssup_collation = sortCollation;
 	state->onlyKey->ssup_nulls_first = nullsFirstFlag;
+	/*
+	 * Conversion to abbreviated representation infeasible in the Datum case.
+	 * It must be possible to subsequently fetch original datum values within
+	 * tuplesort_getdatum(), which would require special-case preservation of
+	 * original values.
+	 */
+	state->onlyKey->abbreviate = false;
 
 	PrepareSortSupportFromOrderingOp(sortOperator, state->onlyKey);
 
@@ -928,6 +962,19 @@ tuplesort_set_bound(Tuplesortstate *state, int64 bound)
 
 	state->bounded = true;
 	state->bound = (int) bound;
+
+	/*
+	 * Bounded sorts are not an effective target for abbreviated key
+	 * optimization.  Disable by setting state to be consistent with no
+	 * abbreviation support.
+	 */
+	state->sortKeys->abbrev_converter = NULL;
+	if (state->sortKeys->abbrev_full_comparator)
+		state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
+
+	/* Not strictly necessary, but be tidy */
+	state->sortKeys->abbrev_abort = NULL;
+	state->sortKeys->abbrev_full_comparator = NULL;
 }
 
 /*
@@ -1186,15 +1233,63 @@ tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
 {
 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
 	SortTuple	stup;
+	Datum		original;
+	IndexTuple	tuple;
 
 	stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull);
-	((IndexTuple) stup.tuple)->t_tid = *self;
+	tuple = ((IndexTuple) stup.tuple);
+	tuple->t_tid = *self;
 	USEMEM(state, GetMemoryChunkSpace(stup.tuple));
 	/* set up first-column key value */
-	stup.datum1 = index_getattr((IndexTuple) stup.tuple,
-								1,
-								RelationGetDescr(state->indexRel),
-								&stup.isnull1);
+	original = index_getattr(tuple,
+							 1,
+							 RelationGetDescr(state->indexRel),
+							 &stup.isnull1);
+
+	if (!state->sortKeys->abbrev_converter || stup.isnull1)
+	{
+		/*
+		 * Store ordinary Datum representation, or NULL value.  If there is a
+		 * converter it won't expect NULL values, and cost model is not
+		 * required to account for NULL, so in that case we avoid calling
+		 * converter and just set datum1 to "void" representation (to be
+		 * consistent).
+		 */
+		stup.datum1 = original;
+	}
+	else if (!consider_abort_common(state))
+	{
+		/* Store abbreviated key representation */
+		stup.datum1 = state->sortKeys->abbrev_converter(original,
+														state->sortKeys);
+	}
+	else
+	{
+		/* Abort abbreviation */
+		int		i;
+
+		stup.datum1 = original;
+
+		/*
+		 * Set state to be consistent with never trying abbreviation.
+		 *
+		 * Alter datum1 representation in already-copied tuples, so as to
+		 * ensure a consistent representation (current tuple was just handled).
+		 * Note that we rely on all tuples copied so far actually being
+		 * contained within memtuples array.
+		 */
+		for (i = 0; i < state->memtupcount; i++)
+		{
+			SortTuple *mtup = &state->memtuples[i];
+
+			tuple = mtup->tuple;
+			mtup->datum1 = index_getattr(tuple,
+										 1,
+										 RelationGetDescr(state->indexRel),
+										 &stup.isnull1);
+		}
+	}
+
 	puttuple_common(state, &stup);
 
 	MemoryContextSwitchTo(oldcontext);
@@ -1359,6 +1454,47 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
 	}
 }
 
+static bool
+consider_abort_common(Tuplesortstate *state)
+{
+	Assert(state->sortKeys[0].abbrev_converter != NULL);
+	Assert(state->sortKeys[0].abbrev_abort != NULL);
+	Assert(state->sortKeys[0].abbrev_full_comparator != NULL);
+
+	/*
+	 * Check effectiveness of abbreviation optimization.  Consider aborting
+	 * when still within memory limit.
+	 */
+	if (state->status == TSS_INITIAL &&
+		state->memtupcount >= state->abbrevNext)
+	{
+		state->abbrevNext *= 2;
+
+		/*
+		 * Check opclass-supplied abbreviation abort routine.  It may
+		 * indicate that abbreviation should not proceed.
+		 */
+		if (!state->sortKeys->abbrev_abort(state->memtupcount,
+										   state->sortKeys))
+			return false;
+
+		/*
+		 * Finally, restore authoritative comparator, and indicate that
+		 * abbreviation is not in play by setting abbrev_converter to NULL
+		 */
+		state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator;
+		state->sortKeys[0].abbrev_converter = NULL;
+		/* Not strictly necessary, but be tidy */
+		state->sortKeys[0].abbrev_abort = NULL;
+		state->sortKeys[0].abbrev_full_comparator = NULL;
+
+		/* Give up - expect original pass-by-value representation */
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * All tuples have been provided; finish the sort.
  */
@@ -2853,6 +2989,12 @@ comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
 	TupleDesc	tupDesc;
 	int			nkey;
 	int32		compare;
+	AttrNumber	attno;
+	Datum		datum1,
+				datum2;
+	bool		isnull1,
+				isnull2;
+
 
 	/* Compare the leading sort key */
 	compare = ApplySortComparator(a->datum1, a->isnull1,
@@ -2867,14 +3009,25 @@ comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
 	rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
 	rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
 	tupDesc = state->tupDesc;
+
+	if (sortKey->abbrev_converter)
+	{
+		attno = sortKey->ssup_attno;
+
+		datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
+		datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
+
+		compare = ApplySortAbbrevFullComparator(datum1, isnull1,
+												datum2, isnull2,
+												sortKey);
+		if (compare != 0)
+			return compare;
+	}
+
 	sortKey++;
 	for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
 	{
-		AttrNumber	attno = sortKey->ssup_attno;
-		Datum		datum1,
-					datum2;
-		bool		isnull1,
-					isnull2;
+		attno = sortKey->ssup_attno;
 
 		datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
 		datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
@@ -2897,6 +3050,7 @@ copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
 	 * MinimalTuple using the exported interface for that.
 	 */
 	TupleTableSlot *slot = (TupleTableSlot *) tup;
+	Datum			original;
 	MinimalTuple tuple;
 	HeapTupleData htup;
 
@@ -2907,10 +3061,58 @@ copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
 	/* set up first-column key value */
 	htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
 	htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
-	stup->datum1 = heap_getattr(&htup,
-								state->sortKeys[0].ssup_attno,
-								state->tupDesc,
-								&stup->isnull1);
+	original = heap_getattr(&htup,
+							state->sortKeys[0].ssup_attno,
+							state->tupDesc,
+							&stup->isnull1);
+
+	if (!state->sortKeys->abbrev_converter || stup->isnull1)
+	{
+		/*
+		 * Store ordinary Datum representation, or NULL value.  If there is a
+		 * converter it won't expect NULL values, and the cost model is not
+		 * required to account for NULLs, so in that case we avoid calling the
+		 * converter and just set datum1 to the "void" representation (to be
+		 * consistent).
+		 */
+		stup->datum1 = original;
+	}
+	else if (!consider_abort_common(state))
+	{
+		/* Store abbreviated key representation */
+		stup->datum1 = state->sortKeys->abbrev_converter(original,
+														 state->sortKeys);
+	}
+	else
+	{
+		/* Abort abbreviation */
+		int		i;
+
+		stup->datum1 = original;
+
+		/*
+		 * Set state to be consistent with never trying abbreviation.
+		 *
+		 * Alter datum1 representation in already-copied tuples, so as to
+		 * ensure a consistent representation (current tuple was just handled).
+		 * Note that we rely on all tuples copied so far actually being
+		 * contained within memtuples array.
+		 */
+		for (i = 0; i < state->memtupcount; i++)
+		{
+			SortTuple *mtup = &state->memtuples[i];
+
+			htup.t_len = ((MinimalTuple) mtup->tuple)->t_len +
+							MINIMAL_TUPLE_OFFSET;
+			htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple -
+							MINIMAL_TUPLE_OFFSET);
+
+			mtup->datum1 = heap_getattr(&htup,
+										state->sortKeys[0].ssup_attno,
+										state->tupDesc,
+										&mtup->isnull1);
+		}
+	}
 }
 
 static void
@@ -2980,13 +3182,35 @@ comparetup_cluster(const SortTuple *a, const SortTuple *b,
 	TupleDesc	tupDesc;
 	int			nkey;
 	int32		compare;
+	Datum		datum1,
+				datum2;
+	bool		isnull1,
+				isnull2;
+	AttrNumber	leading = state->indexInfo->ii_KeyAttrNumbers[0];
+
+	/* Be prepared to compare additional sort keys */
+	ltup = (HeapTuple) a->tuple;
+	rtup = (HeapTuple) b->tuple;
+	tupDesc = state->tupDesc;
 
 	/* Compare the leading sort key, if it's simple */
-	if (state->indexInfo->ii_KeyAttrNumbers[0] != 0)
+	if (leading != 0)
 	{
 		compare = ApplySortComparator(a->datum1, a->isnull1,
 									  b->datum1, b->isnull1,
 									  sortKey);
+		if (compare != 0)
+			return compare;
+
+		if (sortKey->abbrev_converter)
+		{
+			datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1);
+			datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2);
+
+			compare = ApplySortAbbrevFullComparator(datum1, isnull1,
+													datum2, isnull2,
+													sortKey);
+		}
 		if (compare != 0 || state->nKeys == 1)
 			return compare;
 		/* Compare additional columns the hard way */
@@ -2999,22 +3223,13 @@ comparetup_cluster(const SortTuple *a, const SortTuple *b,
 		nkey = 0;
 	}
 
-	/* Compare additional sort keys */
-	ltup = (HeapTuple) a->tuple;
-	rtup = (HeapTuple) b->tuple;
-
 	if (state->indexInfo->ii_Expressions == NULL)
 	{
 		/* If not expression index, just compare the proper heap attrs */
-		tupDesc = state->tupDesc;
 
 		for (; nkey < state->nKeys; nkey++, sortKey++)
 		{
 			AttrNumber	attno = state->indexInfo->ii_KeyAttrNumbers[nkey];
-			Datum		datum1,
-						datum2;
-			bool		isnull1,
-						isnull2;
 
 			datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1);
 			datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2);
@@ -3072,17 +3287,67 @@ static void
 copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup)
 {
 	HeapTuple	tuple = (HeapTuple) tup;
+	Datum		original;
 
 	/* copy the tuple into sort storage */
 	tuple = heap_copytuple(tuple);
 	stup->tuple = (void *) tuple;
 	USEMEM(state, GetMemoryChunkSpace(tuple));
-	/* set up first-column key value, if it's a simple column */
-	if (state->indexInfo->ii_KeyAttrNumbers[0] != 0)
-		stup->datum1 = heap_getattr(tuple,
-									state->indexInfo->ii_KeyAttrNumbers[0],
-									state->tupDesc,
-									&stup->isnull1);
+	/*
+	 * set up first-column key value, and potentially abbreviate, if it's a
+	 * simple column
+	 */
+	if (state->indexInfo->ii_KeyAttrNumbers[0] == 0)
+		return;
+
+	original = heap_getattr(tuple,
+							state->indexInfo->ii_KeyAttrNumbers[0],
+							state->tupDesc,
+							&stup->isnull1);
+
+	if (!state->sortKeys->abbrev_converter || stup->isnull1)
+	{
+		/*
+		 * Store ordinary Datum representation, or NULL value.  If there is a
+		 * converter it won't expect NULL values, and the cost model is not
+		 * required to account for NULLs, so in that case we avoid calling the
+		 * converter and just set datum1 to the "void" representation (to be
+		 * consistent).
+		 */
+		stup->datum1 = original;
+	}
+	else if (!consider_abort_common(state))
+	{
+		/* Store abbreviated key representation */
+		stup->datum1 = state->sortKeys->abbrev_converter(original,
+														 state->sortKeys);
+	}
+	else
+	{
+		/* Abort abbreviation */
+		int		i;
+
+		stup->datum1 = original;
+
+		/*
+		 * Set state to be consistent with never trying abbreviation.
+		 *
+		 * Alter datum1 representation in already-copied tuples, so as to
+		 * ensure a consistent representation (current tuple was just handled).
+		 * Note that we rely on all tuples copied so far actually being
+		 * contained within memtuples array.
+		 */
+		for (i = 0; i < state->memtupcount; i++)
+		{
+			SortTuple *mtup = &state->memtuples[i];
+
+			tuple = (HeapTuple) mtup->tuple;
+			mtup->datum1 = heap_getattr(tuple,
+										state->indexInfo->ii_KeyAttrNumbers[0],
+										state->tupDesc,
+										&mtup->isnull1);
+		}
+	}
 }
 
 static void
@@ -3162,6 +3427,11 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b,
 	bool		equal_hasnull = false;
 	int			nkey;
 	int32		compare;
+	Datum		datum1,
+				datum2;
+	bool		isnull1,
+				isnull2;
 
 	/* Compare the leading sort key */
 	compare = ApplySortComparator(a->datum1, a->isnull1,
@@ -3170,23 +3440,31 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b,
 	if (compare != 0)
 		return compare;
 
-	/* they are equal, so we only need to examine one null flag */
-	if (a->isnull1)
-		equal_hasnull = true;
-
 	/* Compare additional sort keys */
 	tuple1 = (IndexTuple) a->tuple;
 	tuple2 = (IndexTuple) b->tuple;
 	keysz = state->nKeys;
 	tupDes = RelationGetDescr(state->indexRel);
+
+	if (sortKey->abbrev_converter)
+	{
+		datum1 = index_getattr(tuple1, 1, tupDes, &isnull1);
+		datum2 = index_getattr(tuple2, 1, tupDes, &isnull2);
+
+		compare = ApplySortAbbrevFullComparator(datum1, isnull1,
+												datum2, isnull2,
+												sortKey);
+		if (compare != 0)
+			return compare;
+	}
+
+	/* they are equal, so we only need to examine one null flag */
+	if (a->isnull1)
+		equal_hasnull = true;
+
 	sortKey++;
 	for (nkey = 2; nkey <= keysz; nkey++, sortKey++)
 	{
-		Datum		datum1,
-					datum2;
-		bool		isnull1,
-					isnull2;
-
 		datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1);
 		datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2);
 
@@ -3313,6 +3591,7 @@ copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
 	IndexTuple	tuple = (IndexTuple) tup;
 	unsigned int tuplen = IndexTupleSize(tuple);
 	IndexTuple	newtuple;
+	Datum		original;
 
 	/* copy the tuple into sort storage */
 	newtuple = (IndexTuple) palloc(tuplen);
@@ -3320,10 +3599,54 @@ copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
 	USEMEM(state, GetMemoryChunkSpace(newtuple));
 	stup->tuple = (void *) newtuple;
 	/* set up first-column key value */
-	stup->datum1 = index_getattr(newtuple,
-								 1,
-								 RelationGetDescr(state->indexRel),
-								 &stup->isnull1);
+	original = index_getattr(newtuple,
+							 1,
+							 RelationGetDescr(state->indexRel),
+							 &stup->isnull1);
+
+	if (!state->sortKeys->abbrev_converter || stup->isnull1)
+	{
+		/*
+		 * Store ordinary Datum representation, or NULL value.  If there is a
+		 * converter it won't expect NULL values, and the cost model is not
+		 * required to account for NULLs, so in that case we avoid calling the
+		 * converter and just set datum1 to the "void" representation (to be
+		 * consistent).
+		 */
+		stup->datum1 = original;
+	}
+	else if (!consider_abort_common(state))
+	{
+		/* Store abbreviated key representation */
+		stup->datum1 = state->sortKeys->abbrev_converter(original,
+														 state->sortKeys);
+	}
+	else
+	{
+		/* Abort abbreviation */
+		int		i;
+
+		stup->datum1 = original;
+
+		/*
+		 * Set state to be consistent with never trying abbreviation.
+		 *
+		 * Alter datum1 representation in already-copied tuples, so as to
+		 * ensure a consistent representation (current tuple was just handled).
+		 * Note that we rely on all tuples copied so far actually being
+		 * contained within memtuples array.
+		 */
+		for (i = 0; i < state->memtupcount; i++)
+		{
+			SortTuple *mtup = &state->memtuples[i];
+
+			tuple = (IndexTuple) mtup->tuple;
+			mtup->datum1 = index_getattr(tuple,
+										 1,
+										 RelationGetDescr(state->indexRel),
+										 &mtup->isnull1);
+		}
+	}
 }
 
 static void
diff --git a/src/include/lib/hyperloglog.h b/src/include/lib/hyperloglog.h
new file mode 100644
index 0000000..a6cbffc
--- /dev/null
+++ b/src/include/lib/hyperloglog.h
@@ -0,0 +1,67 @@
+/*
+ * hyperloglog.h
+ *
+ * A simple HyperLogLog cardinality estimator implementation
+ *
+ * Portions Copyright (c) 2014, PostgreSQL Global Development Group
+ *
+ * Based on Hideaki Ohno's C++ implementation.  The copyright terms of Ohno's
+ * original version (the MIT license) follow.
+ *
+ * src/include/lib/hyperloglog.h
+ */
+
+/*
+ * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the 'Software'), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef HYPERLOGLOG_H
+#define HYPERLOGLOG_H
+
+/*
+ * HyperLogLog is an approximate technique for computing the number of distinct
+ * entries in a set.  Importantly, it does this by using a fixed amount of
+ * memory.  See the 2007 paper "HyperLogLog: the analysis of a near-optimal
+ * cardinality estimation algorithm" for more.
+ *
+ * hyperLogLogState
+ *
+ *		registerWidth		register width, in bits ("k")
+ *		nRegisters			number of registers
+ *		alphaMM				alpha * m ^ 2 (see initHyperLogLog())
+ *		hashesArr			array of hashes
+ *		arrSize				size of hashesArr
+ */
+typedef struct hyperLogLogState
+{
+	uint8		registerWidth;
+	Size		nRegisters;
+	double		alphaMM;
+	uint8	   *hashesArr;
+	Size		arrSize;
+} hyperLogLogState;
+
+extern void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth);
+extern void	addHyperLogLog(hyperLogLogState *cState, uint32 hash);
+extern double estimateHyperLogLog(hyperLogLogState *cState);
+extern void mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState);
+
+#endif   /* HYPERLOGLOG_H */
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index b82f0f7..7b7c618 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -214,13 +214,13 @@
 #endif
 
 /*
- * Assumed cache line size. This doesn't affect correctness, but can be
- * used for low-level optimizations. Currently, this is only used to pad
- * some data structures in xlog.c, to ensure that highly-contended fields
- * are on different cache lines. Too small a value can hurt performance due
- * to false sharing, while the only downside of too large a value is a few
- * bytes of wasted memory. The default is 128, which should be large enough
- * for all supported platforms.
+ * Assumed cache line size. This doesn't affect correctness, but can be used
+ * for low-level optimizations. Currently, this is used to pad some data
+ * structures in xlog.c, to ensure that highly-contended fields are on
+ * different cache lines. Too small a value can hurt performance due to false
+ * sharing, while the only downside of too large a value is a few bytes of
+ * wasted memory. The default is 128, which should be large enough for all
+ * supported platforms.
  */
 #define PG_CACHE_LINE_SIZE		128
 
diff --git a/src/include/utils/sortsupport.h b/src/include/utils/sortsupport.h
index faae703..4c99ed6 100644
--- a/src/include/utils/sortsupport.h
+++ b/src/include/utils/sortsupport.h
@@ -21,7 +21,12 @@
  * required to provide all of them.  The BTSORTSUPPORT function should
  * simply not set any function pointers for mechanisms it doesn't support.
  * Opclasses that provide BTSORTSUPPORT and don't provide a comparator
- * function will have a shim set up by sort support automatically.
+ * function will have a shim set up by sort support automatically.  However,
+ * opclasses that support the optional additional abbreviated key capability
+ * must always provide an authoritative comparator, used to tie-break
+ * inconclusive abbreviated comparisons and also used when aborting
+ * abbreviation.  Furthermore, a converter and an abort/costing function
+ * must be provided.
  *
  * All sort support functions will be passed the address of the
  * SortSupportData struct when called, so they can use it to store
@@ -93,12 +98,96 @@ typedef struct SortSupportData
 	 * than, equal to, or greater than y.  Note that x and y are guaranteed
 	 * not null, and there is no way to return null either.  Do not return
 	 * INT_MIN, as callers are allowed to negate the result before using it.
+	 *
+	 * This may be either the authoritative comparator or the abbreviated
+	 * comparator.  Even when an opclass initially indicated that abbreviation
+	 * was applicable, core code may later override that preference by
+	 * assigning the authoritative comparator back here.
 	 */
 	int			(*comparator) (Datum x, Datum y, SortSupport ssup);
 
 	/*
-	 * Additional sort-acceleration functions might be added here later.
+	 * "Abbreviated key" infrastructure follows.
+	 *
+	 * All callbacks must be set by sortsupport opclasses that make use of this
+	 * optional additional infrastructure (unless for whatever reason the
+	 * opclass doesn't proceed with abbreviation, in which case
+	 * abbrev_converter must not be set).
+	 *
+	 * This allows opclass authors to supply a conversion routine, used to
+	 * create an alternative representation of the underlying type (an
+	 * "abbreviated key").  Typically, this representation is an ad-hoc,
+	 * pass-by-value Datum format that only the opclass has knowledge of.  An
+	 * alternative comparator, used only with this alternative representation,
+	 * must also be provided (it is assigned to "comparator").  This
+	 * representation is a simple approximation of the original Datum.  It
+	 * must be possible to compare datums of this representation with each
+	 * other using the supplied alternative comparator, and have any non-zero
+	 * return value be a reliable proxy for what a proper comparison would
+	 * indicate.  Returning zero from the alternative comparator does not
+	 * indicate equality, as it would for a conventional support function 1
+	 * comparator; it only indicates that it wasn't possible to determine how
+	 * the two abbreviated values compared.  A proper comparison, using
+	 * "abbrev_full_comparator"/ApplySortAbbrevFullComparator(), is therefore
+	 * required.  In many cases this allows most or all comparisons to use
+	 * only the cheap alternative comparison function, which is typically
+	 * implemented as code that compiles to just a few CPU instructions.  CPU
+	 * cache miss penalties are expensive; to get good overall performance,
+	 * the sort infrastructure must heavily weigh cache performance.
+	 *
+	 * Opclass authors must consider the final cardinality of abbreviated keys
+	 * when devising an encoding scheme.  One strategy may work better than an
+	 * alternative for one usage pattern while the reverse holds for another,
+	 * so these trade-offs must be weighed against the expected workload.
 	 */
+
+	/*
+	 * "abbreviate" indicates whether the abbreviated key optimization is
+	 * applicable in principle (that is, the sortsupport routine needs to know
+	 * if it's dealing with a key where an abbreviated representation can
+	 * usefully be packed together.  Conventionally, this is the leading
+	 * attribute key).  Note, however, that in order to determine whether
+	 * abbreviation is actually in play, the core code always checks whether
+	 * the opclass has set abbrev_converter.  This is a one-way, one-time
+	 * message to the opclass.
+	 */
+	bool			abbreviate;
+
+	/*
+	 * Converter to abbreviated format, from original representation.  Core
+	 * code uses this callback to convert from a pass-by-reference "original"
+	 * Datum to a pass-by-value abbreviated key Datum.  Note that original is
+	 * guaranteed NOT NULL, because it doesn't make sense to factor NULLness
+	 * into ad-hoc cost model.
+	 * into the ad-hoc cost model.
+	 * abbrev_converter is tested to see if abbreviation is in play.  Core code
+	 * may set it to NULL to indicate abbreviation should not be used (which is
+	 * something sortsupport routines need not concern themselves with).
+	 * However, sortsupport routines must not set it when it is immediately
+	 * established that abbreviation should not proceed (e.g. because
+	 * abbreviation was not requested for this sort, or because of
+	 * platform-specific impediments to using abbreviation).
+	 */
+	Datum			(*abbrev_converter) (Datum original, SortSupport ssup);
+
+	/*
+	 * The abbrev_abort callback allows the core code to check whether the
+	 * current abbreviation strategy is working out, using an ad-hoc cost
+	 * model defined by the sortsupport routine.  If there are many duplicate
+	 * abbreviated keys in practice, it's useful to be able to abandon the
+	 * strategy before paying too high a cost in conversion (though
+	 * opclass-specific adaptations short of a full abort may be useful too).
+	 */
+	bool			(*abbrev_abort) (int memtupcount, SortSupport ssup);
+
+	/*
+	 * Full, authoritative comparator for the key that an abbreviated
+	 * representation was generated for, used when an abbreviated comparison
+	 * was inconclusive (by calling ApplySortAbbrevFullComparator()), or used
+	 * to replace "comparator" when the core system ultimately decides against
+	 * abbreviation.
+	 */
+	int			(*abbrev_full_comparator) (Datum x, Datum y, SortSupport ssup);
 } SortSupportData;
 
 
@@ -110,6 +199,9 @@ typedef struct SortSupportData
 extern int ApplySortComparator(Datum datum1, bool isNull1,
 					Datum datum2, bool isNull2,
 					SortSupport ssup);
+extern int ApplySortAbbrevFullComparator(Datum datum1, bool isNull1,
+							Datum datum2, bool isNull2,
+							SortSupport ssup);
 #endif   /* !PG_USE_INLINE */
 #if defined(PG_USE_INLINE) || defined(SORTSUPPORT_INCLUDE_DEFINITIONS)
 /*
@@ -148,6 +240,44 @@ ApplySortComparator(Datum datum1, bool isNull1,
 
 	return compare;
 }
+
+/*
+ * Apply a sort comparator function and return a 3-way comparison, using the
+ * full, authoritative comparator.  This takes care of handling reverse-sort
+ * and NULLs-ordering properly.
+ */
+STATIC_IF_INLINE int
+ApplySortAbbrevFullComparator(Datum datum1, bool isNull1,
+							  Datum datum2, bool isNull2,
+							  SortSupport ssup)
+{
+	int			compare;
+
+	if (isNull1)
+	{
+		if (isNull2)
+			compare = 0;		/* NULL "=" NULL */
+		else if (ssup->ssup_nulls_first)
+			compare = -1;		/* NULL "<" NOT_NULL */
+		else
+			compare = 1;		/* NULL ">" NOT_NULL */
+	}
+	else if (isNull2)
+	{
+		if (ssup->ssup_nulls_first)
+			compare = 1;		/* NOT_NULL ">" NULL */
+		else
+			compare = -1;		/* NOT_NULL "<" NULL */
+	}
+	else
+	{
+		compare = (*ssup->abbrev_full_comparator) (datum1, datum2, ssup);
+		if (ssup->ssup_reverse)
+			compare = -compare;
+	}
+
+	return compare;
+}
 #endif   /*-- PG_USE_INLINE || SORTSUPPORT_INCLUDE_DEFINITIONS */
 
 /* Other functions in utils/sort/sortsupport.c */
-- 
1.9.1
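
For illustration, here is a rough sketch of how an opclass's sortsupport
routine would be expected to fill in the new SortSupportData fields.  It is
not part of the patch: "mytype" and the my_* helpers are made-up names, and
only the field names and callback signatures come from the patch above.

#include "postgres.h"
#include "fmgr.h"
#include "utils/sortsupport.h"

/* Hypothetical helpers; prototypes match the new callback fields */
static int	my_full_cmp(Datum x, Datum y, SortSupport ssup);
static int	my_abbrev_cmp(Datum x, Datum y, SortSupport ssup);
static Datum my_abbrev_convert(Datum original, SortSupport ssup);
static bool my_abbrev_abort(int memtupcount, SortSupport ssup);

Datum
mytype_sortsupport(PG_FUNCTION_ARGS)
{
	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);

	/* The authoritative comparison is always available */
	ssup->comparator = my_full_cmp;

	if (ssup->abbreviate)
	{
		/*
		 * Core code asked for abbreviation of the leading key: install the
		 * abbreviated comparator as "comparator", and keep the authoritative
		 * one in abbrev_full_comparator for tie-breaks and a possible abort.
		 * Setting abbrev_converter is what marks abbreviation as in play.
		 */
		ssup->comparator = my_abbrev_cmp;
		ssup->abbrev_converter = my_abbrev_convert;
		ssup->abbrev_abort = my_abbrev_abort;
		ssup->abbrev_full_comparator = my_full_cmp;
	}

	PG_RETURN_VOID();
}

If my_abbrev_abort() later reports that the encoding isn't paying off,
consider_abort_common() swaps abbrev_full_comparator back into "comparator"
and clears abbrev_converter, as in the tuplesort.c hunks above.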
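
Also for illustration, and likewise not part of the patch: a rough sketch of
how an abbreviation cost model could feed the lib/hyperloglog.h estimator
added above, tracking abbreviated-key cardinality in fixed memory.  The hll_*
names and the register-width and threshold choices are assumptions;
initHyperLogLog(), addHyperLogLog() and estimateHyperLogLog() are declared in
the new header, and hash_any() is the existing routine from access/hash.h.

#include "postgres.h"
#include "access/hash.h"
#include "lib/hyperloglog.h"

/* Hypothetical module-local estimator state */
static hyperLogLogState hll_abbrev_card;

static void
hll_example_init(void)
{
	/* 2^10 = 1024 registers; a plausible, not prescribed, choice */
	initHyperLogLog(&hll_abbrev_card, 10);
}

static void
hll_example_observe(Datum abbrev_key)
{
	uint32		hash;

	/* hash the pass-by-value abbreviated key and feed the estimator */
	hash = DatumGetUInt32(hash_any((unsigned char *) &abbrev_key,
								   sizeof(Datum)));
	addHyperLogLog(&hll_abbrev_card, hash);
}

static bool
hll_example_looks_low_cardinality(int memtupcount)
{
	double		distinct = estimateHyperLogLog(&hll_abbrev_card);

	/* e.g. treat fewer than 1 distinct key per 10 tuples as a bad sign */
	return distinct < memtupcount / 10.0;
}

An opclass's abbrev_abort routine could consult something like
hll_example_looks_low_cardinality() when deciding whether to give up on
abbreviation.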
