From f36581783ff1f917296beac6bd38c981b6f401d8 Mon Sep 17 00:00:00 2001
From: David Rowley <dgrowley@gmail.com>
Date: Thu, 31 Oct 2024 09:34:30 +1300
Subject: [PATCH v5 5/5] Speedup tuple deformation with additional function
 inlining

---
 src/backend/executor/execTuples.c | 210 ++++++++++++++++++++++--------
 1 file changed, 155 insertions(+), 55 deletions(-)

diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c
index 5d81c81267..2b07926113 100644
--- a/src/backend/executor/execTuples.c
+++ b/src/backend/executor/execTuples.c
@@ -991,54 +991,40 @@ tts_buffer_heap_store_tuple(TupleTableSlot *slot, HeapTuple tuple,
 }
 
 /*
- * slot_deform_heap_tuple
- *		Given a TupleTableSlot, extract data from the slot's physical tuple
- *		into its Datum/isnull arrays.  Data is extracted up through the
- *		natts'th column (caller must ensure this is a legal column number).
+ * slot_deform_heap_tuple_internal
+ *		An always inline helper function for use in slot_deform_heap_tuple to
+ *		allow the compiler to emit specialized versions of this function for
+ *		various combinations of "slow" and "hasnulls".  For example, if a
+ *		given tuple has no nulls, then we needn't check "hasnulls" for every
+ *		attribute that we're deforming.  The caller can just call this
+ *		function with hasnulls set to constant-false and have the compiler
+ *		remove the constant-false branches and emit more optimal code.
  *
- *		This is essentially an incremental version of heap_deform_tuple:
- *		on each call we extract attributes up to the one needed, without
- *		re-computing information about previously extracted attributes.
- *		slot->tts_nvalid is the number of attributes already extracted.
+ * Returns the next attnum to deform, which can be equal to natts when the
+ * function managed to deform all requested attributes.  *offp is an input and
+ * output parameter which is the byte offset within the tuple to start deforming
+ * from which, on return, gets set to the offset where the next attribute
+ * should be deformed from.  *slowp is set to true when subsequent deforming
+ * of this tuple must use a version of this function with "slow" passed as
+ * true.
  *
- * This is marked as always inline, so the different offp for different types
- * of slots gets optimized away.
+ * Callers cannot assume when we return "attnum" (i.e. all requested
+ * attributes have been deformed) that slow mode isn't required for any
+ * additional deforming as the final attribute may have caused a switch to
+ * slow mode.
  */
-static pg_attribute_always_inline void
-slot_deform_heap_tuple(TupleTableSlot *slot, HeapTuple tuple, uint32 *offp,
-					   int natts)
+static pg_attribute_always_inline int
+slot_deform_heap_tuple_internal(TupleTableSlot *slot, HeapTuple tuple,
+								int attnum, int natts, bool slow,
+								bool hasnulls, uint32 *offp, bool *slowp)
 {
 	TupleDesc	tupleDesc = slot->tts_tupleDescriptor;
 	Datum	   *values = slot->tts_values;
 	bool	   *isnull = slot->tts_isnull;
 	HeapTupleHeader tup = tuple->t_data;
-	bool		hasnulls = HeapTupleHasNulls(tuple);
-	int			attnum;
 	char	   *tp;				/* ptr to tuple data */
-	uint32		off;			/* offset in tuple data */
 	bits8	   *bp = tup->t_bits;	/* ptr to null bitmap in tuple */
-	bool		slow;			/* can we use/set attcacheoff? */
-
-	/* We can only fetch as many attributes as the tuple has. */
-	natts = Min(HeapTupleHeaderGetNatts(tuple->t_data), natts);
-
-	/*
-	 * Check whether the first call for this tuple, and initialize or restore
-	 * loop state.
-	 */
-	attnum = slot->tts_nvalid;
-	if (attnum == 0)
-	{
-		/* Start from the first attribute */
-		off = 0;
-		slow = false;
-	}
-	else
-	{
-		/* Restore state from previous execution */
-		off = *offp;
-		slow = TTS_SLOW(slot);
-	}
+	bool		slownext = false;
 
 	tp = (char *) tup + tup->t_hoff;
 
@@ -1050,14 +1036,20 @@ slot_deform_heap_tuple(TupleTableSlot *slot, HeapTuple tuple, uint32 *offp,
 		{
 			values[attnum] = (Datum) 0;
 			isnull[attnum] = true;
-			slow = true;		/* can't use attcacheoff anymore */
-			continue;
+			if (!slow)
+			{
+				*slowp = true;
+				return attnum + 1;
+			}
+			else
+				continue;
 		}
 
 		isnull[attnum] = false;
 
+		/* calculate the offset of this attribute */
 		if (!slow && thisatt->attcacheoff >= 0)
-			off = thisatt->attcacheoff;
+			*offp = thisatt->attcacheoff;
 		else if (thisatt->attlen == -1)
 		{
 			/*
@@ -1066,31 +1058,140 @@ slot_deform_heap_tuple(TupleTableSlot *slot, HeapTuple tuple, uint32 *offp,
 			 * pad bytes in any case: then the offset will be valid for either
 			 * an aligned or unaligned value.
 			 */
-			if (!slow &&
-				off == att_nominal_alignby(off, thisatt->attalignby))
-				thisatt->attcacheoff = off;
+			if (!slow && *offp == att_nominal_alignby(*offp, thisatt->attalignby))
+				thisatt->attcacheoff = *offp;
 			else
 			{
-				off = att_pointer_alignby(off, thisatt->attalignby, -1,
-										  tp + off);
-				slow = true;
+				*offp = att_pointer_alignby(*offp,
+											thisatt->attalignby,
+											-1,
+											tp + *offp);
+
+				if (!slow)
+					slownext = true;
 			}
 		}
 		else
 		{
-			/* not varlena, so safe to use att_nominal_alignby */
-			off = att_nominal_alignby(off, thisatt->attalignby);
+			/* not varlena, so safe to use att_align_nominal */
+			*offp = att_nominal_alignby(*offp, thisatt->attalignby);
 
 			if (!slow)
-				thisatt->attcacheoff = off;
+				thisatt->attcacheoff = *offp;
+		}
+
+		values[attnum] = fetchatt(thisatt, tp + *offp);
+
+		*offp = att_addlength_pointer(*offp, thisatt->attlen, tp + *offp);
+
+		/* check if we need to switch to slow mode */
+		if (!slow)
+		{
+			/*
+			 * We're unable to deform any further if the above code set
+			 * 'slownext', or if this isn't a fixed-width attribute.
+			 */
+			if (slownext || thisatt->attlen <= 0)
+			{
+				*slowp = true;
+				return attnum + 1;
+			}
 		}
+	}
 
-		values[attnum] = fetchatt(thisatt, tp + off);
+	return natts;
+}
 
-		off = att_addlength_pointer(off, thisatt->attlen, tp + off);
+/*
+ * slot_deform_heap_tuple
+ *		Given a TupleTableSlot, extract data from the slot's physical tuple
+ *		into its Datum/isnull arrays.  Data is extracted up through the
+ *		natts'th column (caller must ensure this is a legal column number).
+ *
+ *		This is essentially an incremental version of heap_deform_tuple:
+ *		on each call we extract attributes up to the one needed, without
+ *		re-computing information about previously extracted attributes.
+ *		slot->tts_nvalid is the number of attributes already extracted.
+ *
+ * This is marked as always inline, so the different offp for different types
+ * of slots gets optimized away.
+ */
+static pg_attribute_always_inline void
+slot_deform_heap_tuple(TupleTableSlot *slot, HeapTuple tuple, uint32 *offp,
+					   int natts)
+{
+	bool		hasnulls = HeapTupleHasNulls(tuple);
+	int			attnum;
+	uint32		off;			/* offset in tuple data */
+	bool		slow;			/* can we use/set attcacheoff? */
+
+	/* We can only fetch as many attributes as the tuple has. */
+	natts = Min(HeapTupleHeaderGetNatts(tuple->t_data), natts);
 
-		if (thisatt->attlen <= 0)
-			slow = true;		/* can't use attcacheoff anymore */
+	/*
+	 * Check whether the first call for this tuple, and initialize or restore
+	 * loop state.
+	 */
+	attnum = slot->tts_nvalid;
+	if (attnum == 0)
+	{
+		/* Start from the first attribute */
+		off = 0;
+		slow = false;
+	}
+	else
+	{
+		/* Restore state from previous execution */
+		off = *offp;
+		slow = TTS_SLOW(slot);
+	}
+
+	/*
+	 * If 'slow' isn't set, try deforming using deforming code that does not
+	 * contain any of the extra checks required for non-fixed offset
+	 * deforming.  During deforming, if or when we find a NULL or a variable
+	 * length attribute, we'll switch to a deforming method which includes the
+	 * extra code required for non-fixed offset deforming, a.k.a slow mode.
+	 * Because this is performance critical, we inline
+	 * slot_deform_heap_tuple_internal passing the 'slow' and 'hasnull'
+	 * parameters as constants to allow the compiler to emit specialized code
+	 * with the known-const false comparisons and subsequent branches removed.
+	 */
+	if (!slow)
+	{
+		/* Tuple without any NULLs? We can skip doing any NULL checking */
+		if (!hasnulls)
+			attnum = slot_deform_heap_tuple_internal(slot,
+													 tuple,
+													 attnum,
+													 natts,
+													 false, /* slow */
+													 false, /* hasnulls */
+													 &off,
+													 &slow);
+		else
+			attnum = slot_deform_heap_tuple_internal(slot,
+													 tuple,
+													 attnum,
+													 natts,
+													 false, /* slow */
+													 true,	/* hasnulls */
+													 &off,
+													 &slow);
+	}
+
+	/* If there's still work to do then we must be in slow mode */
+	if (attnum < natts)
+	{
+		/* XXX is it worth adding a separate call when hasnulls is false? */
+		attnum = slot_deform_heap_tuple_internal(slot,
+												 tuple,
+												 attnum,
+												 natts,
+												 true,	/* slow */
+												 hasnulls,
+												 &off,
+												 &slow);
 	}
 
 	/*
@@ -1104,7 +1205,6 @@ slot_deform_heap_tuple(TupleTableSlot *slot, HeapTuple tuple, uint32 *offp,
 		slot->tts_flags &= ~TTS_FLAG_SLOW;
 }
 
-
 const TupleTableSlotOps TTSOpsVirtual = {
 	.base_slot_size = sizeof(VirtualTupleTableSlot),
 	.init = tts_virtual_init,
-- 
2.34.1

