From fee005e20d121fb79ba3b53d631f4772ce5896fe Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Mon, 14 Jan 2019 16:09:48 -0800
Subject: [PATCH] Avoid amcheck TOAST compression inconsistencies.

Normalize input varlena datums by decompressing compressed values.  This
avoids false positive reports of corruption in cases where the executor
input a slot containing a heap tuple that is compressed in the heap but
not an index.  For example, the executor can apply TOAST compression to
an entire tuple because it has a single external TOAST pointer.

I discovered this bug initially, but it was independently reported
shortly afterwards by Andreas Kunert.  A new test case is cribbed from
his example.

Discussion: https://postgr.es/m/CAH2-WznrVd9ie+TTJ45nDT+v2nUt6YJwQrT9SebCdQKtAvfPZw@mail.gmail.com
Discussion: https://postgr.es/m/15597-294e5d3e7f01c407@postgresql.org
---
 contrib/amcheck/expected/check_btree.out |  47 +++++++
 contrib/amcheck/sql/check_btree.sql      |  45 +++++++
 contrib/amcheck/verify_nbtree.c          | 163 +++++++++++++++++++----
 3 files changed, 232 insertions(+), 23 deletions(-)

diff --git a/contrib/amcheck/expected/check_btree.out b/contrib/amcheck/expected/check_btree.out
index e864579774..3b4044d61e 100644
--- a/contrib/amcheck/expected/check_btree.out
+++ b/contrib/amcheck/expected/check_btree.out
@@ -140,10 +140,57 @@ SELECT bt_index_parent_check('delete_test_table_pkey', true);
  
 (1 row)
 
+--
+-- BUG #15597: must not assume consistent input toasting state when forming
+-- tuple.  Bloom filter must fingerprint normalized index tuple representation.
+--
+-- Example assumes TOAST_TUPLE_TARGET of TOAST_TUPLE_THRESHOLD (default).
+--
+CREATE TABLE toast_bug
+(
+  a integer,
+  b character(255),
+  c character(255),
+  d character(255),
+  e character(25),
+  g date,
+  h date,
+  i date,
+  j character(255),
+  k character(255),
+  m character(10),
+  n character(255),
+  o character(1),
+  p character(4),
+  q character(1),
+  r integer,
+  t character(255),
+  u character(50),
+  v character(100)
+);
+CREATE INDEX i_toast_bug ON toast_bug USING btree (b, c);
+-- Heap tuple length is high enough to get us over heap tuple TOAST threshold
+-- primarily due to table having many columns.  Tuple toasting machinery will
+-- compress the biggest datum inline (or joint biggest) to get under maximum.
+-- Index tuple formation will not determine that it's worth compressing, which
+-- normalization must cope with.
+INSERT INTO toast_bug
+VALUES (
+  1, 'b', 'c', 'd', 'e',
+  '2000-01-01', '2000-01-01', NULL,
+  'j', 'k', 'm', 'n', 'o', 'p', 'q', 2, 't', 'u', 'v');
+-- Should not get false positive report of corruption:
+SELECT bt_index_check('i_toast_bug', true);
+ bt_index_check 
+----------------
+ 
+(1 row)
+
 -- cleanup
 DROP TABLE bttest_a;
 DROP TABLE bttest_b;
 DROP TABLE bttest_multi;
 DROP TABLE delete_test_table;
+DROP TABLE toast_bug;
 DROP OWNED BY bttest_role; -- permissions
 DROP ROLE bttest_role;
diff --git a/contrib/amcheck/sql/check_btree.sql b/contrib/amcheck/sql/check_btree.sql
index 7b1ab4f148..983588859a 100644
--- a/contrib/amcheck/sql/check_btree.sql
+++ b/contrib/amcheck/sql/check_btree.sql
@@ -88,10 +88,55 @@ DELETE FROM delete_test_table WHERE a > 10;
 VACUUM delete_test_table;
 SELECT bt_index_parent_check('delete_test_table_pkey', true);
 
+--
+-- BUG #15597: must not assume consistent input toasting state when forming
+-- tuple.  Bloom filter must fingerprint normalized index tuple representation.
+--
+-- Example assumes TOAST_TUPLE_TARGET of TOAST_TUPLE_THRESHOLD (default).
+--
+CREATE TABLE toast_bug
+(
+  a integer,
+  b character(255),
+  c character(255),
+  d character(255),
+  e character(25),
+  g date,
+  h date,
+  i date,
+  j character(255),
+  k character(255),
+  m character(10),
+  n character(255),
+  o character(1),
+  p character(4),
+  q character(1),
+  r integer,
+  t character(255),
+  u character(50),
+  v character(100)
+);
+CREATE INDEX i_toast_bug ON toast_bug USING btree (b, c);
+
+-- Heap tuple length is high enough to get us over heap tuple TOAST threshold
+-- primarily due to table having many columns.  Tuple toasting machinery will
+-- compress the biggest datum inline (or joint biggest) to get under maximum.
+-- Index tuple formation will not determine that it's worth compressing, which
+-- normalization must cope with.
+INSERT INTO toast_bug
+VALUES (
+  1, 'b', 'c', 'd', 'e',
+  '2000-01-01', '2000-01-01', NULL,
+  'j', 'k', 'm', 'n', 'o', 'p', 'q', 2, 't', 'u', 'v');
+
+-- Should not get false positive report of corruption:
+SELECT bt_index_check('i_toast_bug', true);
+
 -- cleanup
 DROP TABLE bttest_a;
 DROP TABLE bttest_b;
 DROP TABLE bttest_multi;
 DROP TABLE delete_test_table;
+DROP TABLE toast_bug;
 DROP OWNED BY bttest_role; -- permissions
 DROP ROLE bttest_role;
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 1c7466c815..30fd66e94c 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -27,6 +27,7 @@
 #include "access/htup_details.h"
 #include "access/nbtree.h"
 #include "access/transam.h"
+#include "access/tuptoaster.h"
 #include "access/xact.h"
 #include "catalog/index.h"
 #include "catalog/pg_am.h"
@@ -133,6 +134,8 @@ static void bt_downlink_missing_check(BtreeCheckState *state);
 static void bt_tuple_present_callback(Relation index, HeapTuple htup,
 						  Datum *values, bool *isnull,
 						  bool tupleIsAlive, void *checkstate);
+static IndexTuple bt_toast_normalize_tuple(BtreeCheckState *state,
+						   IndexTuple itup);
 static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
 							OffsetNumber offset);
 static inline bool invariant_leq_offset(BtreeCheckState *state,
@@ -908,7 +911,16 @@ bt_target_page_check(BtreeCheckState *state)
 
 		/* Fingerprint leaf page tuples (those that point to the heap) */
 		if (state->heapallindexed && P_ISLEAF(topaque) && !ItemIdIsDead(itemid))
-			bloom_add_element(state->filter, (unsigned char *) itup, tupsize);
+		{
+			IndexTuple		norm;
+
+			norm = bt_toast_normalize_tuple(state, itup);
+
+			bloom_add_element(state->filter, (unsigned char *) norm,
+							  IndexTupleSize(norm));
+			if (norm != itup)
+				pfree(norm);
+		}
 
 		/*
 		 * * High key check *
@@ -1672,35 +1684,32 @@ bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values,
 						  bool *isnull, bool tupleIsAlive, void *checkstate)
 {
 	BtreeCheckState *state = (BtreeCheckState *) checkstate;
-	IndexTuple	itup;
+	IndexTuple	itup, norm;
 
 	Assert(state->heapallindexed);
 
-	/*
-	 * Generate an index tuple for fingerprinting.
-	 *
-	 * Index tuple formation is assumed to be deterministic, and IndexTuples
-	 * are assumed immutable.  While the LP_DEAD bit is mutable in leaf pages,
-	 * that's ItemId metadata, which was not fingerprinted.  (There will often
-	 * be some dead-to-everyone IndexTuples fingerprinted by the Bloom filter,
-	 * but we only try to detect the absence of needed tuples, so that's
-	 * okay.)
-	 *
-	 * Note that we rely on deterministic index_form_tuple() TOAST
-	 * compression. If index_form_tuple() was ever enhanced to compress datums
-	 * out-of-line, or otherwise varied when or how compression was applied,
-	 * our assumption would break, leading to false positive reports of
-	 * corruption.  It's also possible that non-pivot tuples could in the
-	 * future have alternative equivalent representations (e.g. by using the
-	 * INDEX_ALT_TID_MASK bit). For now, we don't decompress/normalize toasted
-	 * values as part of fingerprinting.
-	 */
+	/* Generate an index tuple for fingerprinting */
 	itup = index_form_tuple(RelationGetDescr(index), values, isnull);
 	itup->t_tid = htup->t_self;
 
+	/*
+	 * Normalize.  It's fairly inefficient for normalization to deform and
+	 * then reform the tuple that was just formed, but in practice compressed
+	 * datums in indexes are fairly rare, so a second index_form_tuple() is
+	 * usually avoided.  Besides, normalization could need to be expanded to
+	 * do more in the future, which can be accommodated by the same simple
+	 * interface.  If index_form_tuple() was ever enhanced to compress datums
+	 * out-of-line, we'd only need to specify "read only" index_form_tuple()
+	 * access; verification itself would continue to work.  It's also possible
+	 * that non-pivot tuples could in the future have alternative logically
+	 * equivalent representations due to other factors (e.g. they could use
+	 * the INDEX_ALT_TID_MASK bit).
+	 */
+	norm = bt_toast_normalize_tuple(state, itup);
+
 	/* Probe Bloom filter -- tuple should be present */
-	if (bloom_lacks_element(state->filter, (unsigned char *) itup,
-							IndexTupleSize(itup)))
+	if (bloom_lacks_element(state->filter, (unsigned char *) norm,
+							IndexTupleSize(norm)))
 		ereport(ERROR,
 				(errcode(ERRCODE_DATA_CORRUPTED),
 				 errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"",
@@ -1714,6 +1723,114 @@ bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values,
 
 	state->heaptuplespresent++;
 	pfree(itup);
+	if (norm != itup)
+		pfree(norm);
+}
+
+/*
+ * Normalize an index tuple for fingerprinting.
+ *
+ * In general, index tuple formation is assumed to be deterministic by
+ * heapallindexed verification, and IndexTuples are assumed immutable.  While
+ * the LP_DEAD bit is mutable in leaf pages, that's ItemId metadata, which is
+ * not fingerprinted.  Normalization is required to compensate for corner
+ * cases where the determinism assumption doesn't quite work.
+ *
+ * There is currently one such case: index_form_tuple() does not try to hide
+ * the source TOAST state of input datums.  The executor applies TOAST
+ * compression for heap tuples based on different criteria to the compression
+ * applied within btinsert()'s call to index_form_tuple(): it sometimes
+ * compresses more aggressively, leaving TOASTed heap tuple datums but
+ * uncompressed corresponding index tuple datums.  A subsequent heapallindexed
+ * verification will get a logically equivalent though bitwise unequal tuple
+ * from index_form_tuple(), since its input would now be coming from the
+ * TOASTed heap tuple directly (not some tuple slot).  False positive
+ * heapallindexed corruption reports could occur without normalizing away
+ * possible inconsistencies.
+ *
+ * Returned tuple is often caller's own original tuple.  Otherwise, it is a
+ * new representation of caller's original index tuple, palloc()'d in caller's
+ * memory context.
+ *
+ * Note: This routine is not concerned with distinctions about the
+ * representation of tuples beyond those that might break heapallindexed
+ * verification.  In particular, it won't try to normalize opclass-equal
+ * datums with potentially distinct representations where the details are
+ * known only to opclasses (e.g., btree/numeric_ops index datums will not get
+ * their display scale normalized-away here).
+ */
+static IndexTuple
+bt_toast_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
+{
+	TupleDesc	tupleDescriptor = RelationGetDescr(state->rel);
+	Datum		normalized[INDEX_MAX_KEYS];
+	bool		isnull[INDEX_MAX_KEYS];
+	bool		toast_free[INDEX_MAX_KEYS];
+	IndexTuple	reformed;
+	bool		normalizetup = false;
+	int			i;
+
+	/* Easy case: Tuple has no varlena datums at all */
+	if (!IndexTupleHasVarwidths(itup))
+		return itup;
+
+	for (i = 0; i < tupleDescriptor->natts; i++)
+	{
+		Form_pg_attribute	att;
+		struct varlena	   *attval;
+
+		att = TupleDescAttr(tupleDescriptor, i);
+
+		/* Assume untoasted/already normalized datum initially */
+		toast_free[i] = false;
+		normalized[i] = index_getattr(itup, i + 1,
+									  tupleDescriptor,
+									  &isnull[i]);
+		if (att->attbyval || att->attlen != -1 || isnull[i])
+			continue;
+
+		/*
+		 * Callers always pass a tuple that could safely be inserted into the
+		 * index without further processing, so an external varlena header
+		 * should never be encountered here
+		 */
+		attval = (struct varlena *) DatumGetPointer(normalized[i]);
+		if (VARATT_IS_EXTERNAL(attval))
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg("external varlena datum in tuple that references heap row (%u,%u) in index \"%s\"",
+							ItemPointerGetBlockNumber(&(itup->t_tid)),
+							ItemPointerGetOffsetNumber(&(itup->t_tid)),
+							RelationGetRelationName(state->rel))));
+		else if (VARATT_IS_COMPRESSED(attval))
+		{
+			normalizetup = true;
+			normalized[i] = PointerGetDatum(heap_tuple_untoast_attr(attval));
+			toast_free[i] = true;
+		}
+	}
+
+	/* Easier case: Tuple has varlena datums, none of which are compressed */
+	if (!normalizetup)
+		return itup;
+
+	/*
+	 * Hard case: Tuple had compressed varlena datums that necessitate
+	 * creating normalized version of the tuple from uncompressed input datums
+	 * (normalized input datums).  Forming a new representation of the tuple
+	 * usually though not always results in recompressing the datums that were
+	 * just decompressed.  This is rather naive, but shouldn't be necessary
+	 * too often.
+	 */
+	reformed = index_form_tuple(tupleDescriptor, normalized, isnull);
+	reformed->t_tid = itup->t_tid;
+
+	/* Cannot leak memory here */
+	for (i = 0; i < tupleDescriptor->natts; i++)
+		if (toast_free[i])
+			pfree(DatumGetPointer(normalized[i]));
+
+	return reformed;
 }
 
 /*
-- 
2.17.1