On 08/04/2019 20:37, Andres Freund wrote:
> Hi,
>
> On 2019-04-08 15:34:46 +0300, Heikki Linnakangas wrote:
>> There were a bunch of typos in the comments in tableam.h, see attached. Some
>> of the comments could use more copy-editing and clarification, I think, but
>> I stuck to fixing just typos and such for now.
>
> I pushed these after adding three boring changes by pgindent. Thanks for
> those!
>
> I'd greatly welcome more feedback on the comments - I've been pretty
> deep in this for so long that I don't see all of the issues anymore. And
> a mild dyslexia doesn't help...
Here is another iteration on the comments. The patch is a mix of
copy-editing and questions. The questions are marked with "HEIKKI:". I
can continue the copy-editing, if you reply to the questions and clarify
the intended behaviour of those parts of the API. (Or feel free to pick
and push any of these fixes immediately, if you prefer.)
- Heikki
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index f7f726b5aec..bbcab9ce31a 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3638,7 +3638,7 @@ static struct config_string ConfigureNamesString[] =
{"default_table_access_method", PGC_USERSET, CLIENT_CONN_STATEMENT,
gettext_noop("Sets the default table access method for new tables."),
NULL,
- GUC_IS_NAME
+ GUC_NOT_IN_SAMPLE | GUC_IS_NAME
},
&default_table_access_method,
DEFAULT_TABLE_ACCESS_METHOD,
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 82de4cdcf2c..8aeeba38ca2 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -57,6 +57,8 @@ typedef struct TableScanDescData *TableScanDesc;
* pointer to this structure. The information here must be sufficient to
* properly initialize each new TableScanDesc as workers join the scan, and it
* must act as a information what to scan for those workers.
+ *
+ * This is stored in dynamic shared memory, so you can't use pointers here.
*/
typedef struct ParallelTableScanDescData
{
@@ -64,6 +66,11 @@ typedef struct ParallelTableScanDescData
bool phs_syncscan; /* report location to syncscan logic? */
bool phs_snapshot_any; /* SnapshotAny, not phs_snapshot_data? */
Size phs_snapshot_off; /* data for snapshot */
+
+ /*
+ * Table AM specific data follows. After the table AM specific data
+ * comes the snapshot data, at 'phs_snapshot_off'.
+ */
} ParallelTableScanDescData;
typedef struct ParallelTableScanDescData *ParallelTableScanDesc;
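To illustrate the layout: a table AM embeds this struct at the start of its own
parallel scan state, and the serialized snapshot is copied in after the
AM-specific part, at phs_snapshot_off. A rough sketch (the struct and field
names are hypothetical, modeled on the heap AM's ParallelBlockTableScanDescData):

    typedef struct MyAMParallelScanDescData
    {
        ParallelTableScanDescData base;     /* common part, must come first */
        BlockNumber         nblocks;        /* AM-specific scan state */
        pg_atomic_uint64    next_item;      /* next item to hand out */
        /* serialized snapshot data follows, at base.phs_snapshot_off */
    } MyAMParallelScanDescData;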
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 6fbfcb96c98..d4709563e7e 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -91,8 +91,9 @@ typedef enum TM_Result
* xmax is the outdating transaction's XID. If the caller wants to visit the
* replacement tuple, it must check that this matches before believing the
* replacement is really a match.
+ * HEIKKI: matches what? xmin, but that's specific to the heapam.
*
- * cmax is the outdating command's CID, but only when the failure code is
+ * cmax is the outdating command's CID. Only set when the failure code is
* TM_SelfModified (i.e., something in the current transaction outdated the
* tuple); otherwise cmax is zero. (We make this restriction because
* HeapTupleHeaderGetCmax doesn't work for tuples outdated in other
@@ -133,7 +134,11 @@ typedef void (*IndexBuildCallback) (Relation index,
* returned by FormData_pg_am.amhandler.
*
* In most cases it's not appropriate to call the callbacks directly, use the
- * table_* wrapper functions instead.
+ * table_* wrapper functions instead. The descriptions of the callbacks here
+ * are written from the AM implementor's point of view. For more information
+ * on how to call them, see the wrapper functions. (If you update the comments
+ * on either a callback or a wrapper function, remember to also update the other
+ * one!)
*
* GetTableAmRoutine() asserts that required callbacks are filled in, remember
* to update when adding a callback.
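As a reminder of how these callbacks get wired up: an AM's handler function
returns a statically allocated TableAmRoutine, the same way the heap AM's
handler does. A minimal sketch for a hypothetical "myam" extension:

    static const TableAmRoutine myam_methods = {
        .type = T_TableAmRoutine,

        .slot_callbacks = myam_slot_callbacks,
        .scan_begin = myam_scan_begin,
        .scan_end = myam_scan_end,
        /* ... all the other required callbacks ... */
    };

    PG_FUNCTION_INFO_V1(myam_handler);

    Datum
    myam_handler(PG_FUNCTION_ARGS)
    {
        PG_RETURN_POINTER(&myam_methods);
    }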
@@ -179,6 +184,12 @@ typedef struct TableAmRoutine
*
* if temp_snap is true, the snapshot will need to be deallocated at
* scan_end.
+ *
+ * HEIKKI: table_scan_update_snapshot() changes the snapshot. That's
+ * a bit surprising for the AM, no? Can it be called when a scan is
+ * already in progress?
+ *
+ * HEIKKI: A flags bitmask argument would be more readable than 6 booleans
*/
TableScanDesc (*scan_begin) (Relation rel,
Snapshot snapshot,
@@ -194,6 +205,9 @@ typedef struct TableAmRoutine
/*
* Release resources and deallocate scan. If TableScanDesc.temp_snap,
* TableScanDesc.rs_snapshot needs to be unregistered.
+ *
+ * HEIKKI: I find this 'temp_snap' thing pretty weird. Can't the caller handle
+ * deregistering it?
*/
void (*scan_end) (TableScanDesc scan);
@@ -221,6 +235,11 @@ typedef struct TableAmRoutine
/*
* Estimate the size of shared memory needed for a parallel scan of this
* relation. The snapshot does not need to be accounted for.
+ *
+ * HEIKKI: If this returns X, then the parallelscan_initialize() call
+ * mustn't use more than X. So this is not just for optimization purposes,
+ * for example. Not sure how to phrase that, but could use some
+ * clarification.
*/
Size (*parallelscan_estimate) (Relation rel);
@@ -228,6 +247,8 @@ typedef struct TableAmRoutine
* Initialize ParallelTableScanDesc for a parallel scan of this relation.
* `pscan` will be sized according to parallelscan_estimate() for the same
* relation.
+ *
+ * HEIKKI: What does this return?
*/
Size (*parallelscan_initialize) (Relation rel,
ParallelTableScanDesc pscan);
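For illustration, the estimate/initialize pair for a hypothetical AM, reusing
the MyAMParallelScanDescData sketched above for relscan.h. (Whether initialize
is supposed to return the number of bytes actually used, as the heap AM appears
to do, is part of the question above.)

    static Size
    myam_parallelscan_estimate(Relation rel)
    {
        return sizeof(MyAMParallelScanDescData);
    }

    static Size
    myam_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
    {
        MyAMParallelScanDescData *myscan = (MyAMParallelScanDescData *) pscan;

        myscan->nblocks = RelationGetNumberOfBlocks(rel);
        pg_atomic_init_u64(&myscan->next_item, 0);

        return sizeof(MyAMParallelScanDescData);    /* bytes actually used */
    }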
@@ -246,18 +267,22 @@ typedef struct TableAmRoutine
*/
/*
- * Prepare to fetch tuples from the relation, as needed when fetching
- * tuples for an index scan. The callback has to return an
- * IndexFetchTableData, which the AM will typically embed in a larger
- * structure with additional information.
+ * Prepare to fetch tuples from the relation, for an index scan. The
+ * callback has to return an IndexFetchTableData, which the AM will
+ * typically embed in a larger structure with additional information.
*
- * Tuples for an index scan can then be fetched via index_fetch_tuple.
+ * After this, the caller will call index_fetch_tuple(), as many times as
+ * needed, to fetch the tuples.
*/
struct IndexFetchTableData *(*index_fetch_begin) (Relation rel);
/*
* Reset index fetch. Typically this will release cross index fetch
* resources held in IndexFetchTableData.
+ *
+ * HEIKKI: Is this called between every call to index_fetch_tuple()?
+ * Between every call to index_fetch_tuple(), except when call_again is
+ * set? Can it be a no-op?
*/
void (*index_fetch_reset) (struct IndexFetchTableData *data);
@@ -272,19 +297,22 @@ typedef struct TableAmRoutine
* test, return true, false otherwise.
*
* Note that AMs that do not necessarily update indexes when indexed
- * columns do not change, need to return the current/correct version of
+ * columns don't change, need to return the current/correct version of
* the tuple that is visible to the snapshot, even if the tid points to an
* older version of the tuple.
*
* *call_again is false on the first call to index_fetch_tuple for a tid.
- * If there potentially is another tuple matching the tid, *call_again
- * needs be set to true by index_fetch_tuple, signalling to the caller
+ * If there potentially is another tuple matching the tid, the callback
+ * needs to set *call_again to true, signalling to the caller
* that index_fetch_tuple should be called again for the same tid.
*
* *all_dead, if all_dead is not NULL, should be set to true by
* index_fetch_tuple iff it is guaranteed that no backend needs to see
- * that tuple. Index AMs can use that do avoid returning that tid in
+ * that tuple. Index AMs can use that to avoid returning that tid in
* future searches.
+ *
+ * HEIKKI: Should the snapshot be given in index_fetch_begin()? Can it
+ * differ between calls?
*/
bool (*index_fetch_tuple) (struct IndexFetchTableData *scan,
ItemPointer tid,
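For reference, the caller-side protocol behind these three callbacks looks
roughly like this (a simplified sketch of what index_getnext_slot() does
through the table_index_fetch_* wrappers; 'tid', 'snapshot' and 'slot' come
from the enclosing index scan, and recheck/error handling is omitted):

    IndexFetchTableData *fetch = table_index_fetch_begin(rel);
    bool        call_again;
    bool        all_dead;
    bool        found;

    /* for each TID returned by the index AM ... */
    call_again = false;
    do
    {
        found = table_index_fetch_tuple(fetch, &tid, snapshot, slot,
                                        &call_again, &all_dead);
    } while (!found && call_again);

    table_index_fetch_end(fetch);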
@@ -302,6 +330,8 @@ typedef struct TableAmRoutine
* Fetch tuple at `tid` into `slot`, after doing a visibility test
* according to `snapshot`. If a tuple was found and passed the visibility
* test, returns true, false otherwise.
+ *
+ * HEIKKI: explain how this differs from index_fetch_tuple.
*/
bool (*tuple_fetch_row_version) (Relation rel,
ItemPointer tid,
@@ -311,14 +341,17 @@ typedef struct TableAmRoutine
/*
* Return the latest version of the tuple at `tid`, by updating `tid` to
* point at the newest version.
+ *
+ * HEIKKI: the latest version visible to the snapshot?
*/
void (*tuple_get_latest_tid) (Relation rel,
Snapshot snapshot,
ItemPointer tid);
/*
- * Does the tuple in `slot` satisfy `snapshot`? The slot needs to be of
- * the appropriate type for the AM.
+ * Does the tuple in `slot` satisfy `snapshot`?
+ *
+ * The AM may modify the data underlying the tuple as a side-effect.
*/
bool (*tuple_satisfies_snapshot) (Relation rel,
TupleTableSlot *slot,
@@ -413,8 +446,8 @@ typedef struct TableAmRoutine
*/
/*
- * This callback needs to create a new relation filenode for `rel`, with
- * appropriate durability behaviour for `persistence`.
+ * Create a new relation filenode for `rel`, with appropriate durability
+ * behaviour for `persistence`.
*
* On output *freezeXid, *minmulti must be set to the values appropriate
* for pg_class.{relfrozenxid, relminmxid}. For AMs that don't need those
@@ -429,24 +462,40 @@ typedef struct TableAmRoutine
MultiXactId *minmulti);
/*
- * This callback needs to remove all contents from `rel`'s current
- * relfilenode. No provisions for transactional behaviour need to be made.
- * Often this can be implemented by truncating the underlying storage to
- * its minimal size.
- *
- * See also table_relation_nontransactional_truncate().
+ * Remove all rows from `rel`'s current relfilenode. No provisions for
+ * transactional behaviour need to be made. Often this can be implemented
+ * by truncating the underlying storage to its minimal size.
*/
void (*relation_nontransactional_truncate) (Relation rel);
/*
- * See table_relation_copy_data().
+ * Copy data from `rel` into the new relfilenode `newrnode`. The new
+ * relfilenode might not have storage associated with it before this
+ * callback is called. This is used for low level operations like
+ * changing a relation's tablespace.
*
* This can typically be implemented by directly copying the underlying
* storage, unless it contains references to the tablespace internally.
*/
void (*relation_copy_data) (Relation rel, RelFileNode newrnode);
- /* See table_relation_copy_for_cluster() */
+ /*
+ * Copy all data from `OldHeap` into `NewHeap`, as part of a CLUSTER or
+ * VACUUM FULL.
+ *
+ * If `OldIndex` is valid, the data should be ordered according to the
+ * given index. If `use_sort` is false, the data should be fetched from the
+ * index, otherwise it should be fetched from the old table and sorted.
+ *
+ * OldestXmin, FreezeXid, MultiXactCutoff are currently valid values for
+ * the table.
+ * HEIKKI: What does "currently valid" mean? Valid for the old table?
+ *
+ * The callback should set *num_tuples, *tups_vacuumed, *tups_recently_dead
+ * to statistics computed while copying the relation's data. Not all of them
+ * might make sense for every AM.
+ * HEIKKI: What to do for the ones that don't make sense? Set to 0?
+ */
void (*relation_copy_for_cluster) (Relation NewHeap,
Relation OldHeap,
Relation OldIndex,
@@ -466,9 +515,8 @@ typedef struct TableAmRoutine
* On entry a transaction is already established, and the relation is
* locked with a ShareUpdateExclusive lock.
*
- * Note that neither VACUUM FULL (and CLUSTER), nor ANALYZE go through
- * this routine, even if (for ANALYZE) it is part of the same VACUUM
- * command.
+ * Note that VACUUM FULL (or CLUSTER) does not use this callback.
+ * Neither does ANALYZE, even if it is part of the same VACUUM command.
*
* There probably, in the future, needs to be a separate callback to
* integrate with autovacuum's scheduling.
@@ -479,13 +527,13 @@ typedef struct TableAmRoutine
/*
* Prepare to analyze block `blockno` of `scan`. The scan has been started
- * with table_beginscan_analyze(). See also
- * table_scan_analyze_next_block().
+ * with table_beginscan_analyze().
*
* The callback may acquire resources like locks that are held until
- * table_scan_analyze_next_tuple() returns false. It e.g. can make sense
+ * table_scan_analyze_next_tuple() returns false. For example, it can make sense
* to hold a lock until all tuples on a block have been analyzed by
* scan_analyze_next_tuple.
+ * HEIKKI: Hold a lock on what? A lwlock on the page?
*
* The callback can return false if the block is not suitable for
* sampling, e.g. because it's a metapage that could never contain tuples.
@@ -589,8 +637,8 @@ typedef struct TableAmRoutine
struct TBMIterateResult *tbmres);
/*
- * Fetch the next tuple of a bitmap table scan into `slot` and return true
- * if a visible tuple was found, false otherwise.
+ * Fetch the next visible tuple of a bitmap table scan into `slot`. If a
+ * tuple was found, returns true, false otherwise.
*
* For some AMs it will make more sense to do all the work referencing
* `tbmres` contents in scan_bitmap_next_block, for others it might be
@@ -618,6 +666,8 @@ typedef struct TableAmRoutine
* internally needs to perform mapping between the internal and a block
* based representation.
*
+ * HEIKKI: What TsmRoutine? Where is that?
+ *
* Note that it's not acceptable to hold deadlock prone resources such as
* lwlocks until scan_sample_next_tuple() has exhausted the tuples on the
* block - the tuple is likely to be returned to an upper query node, and
@@ -632,9 +682,11 @@ typedef struct TableAmRoutine
struct SampleScanState *scanstate);
/*
- * This callback, only called after scan_sample_next_block has returned
- * true, should determine the next tuple to be returned from the selected
- * block using the TsmRoutine's NextSampleTuple() callback.
+ * Return the next tuple in a sample scan.
+ *
+ * This callback will only be called after scan_sample_next_block has
+ * returned true. It should determine the next tuple to be returned from
+ * the selected block using the TsmRoutine's NextSampleTuple() callback.
*
* The callback needs to perform visibility checks, and only return
* visible tuples. That obviously can mean calling NextSampletuple()
@@ -657,15 +709,15 @@ typedef struct TableAmRoutine
*/
/*
- * Returns slot callbacks suitable for holding tuples of the appropriate type
+ * Return slot callbacks suitable for holding tuples of the appropriate type
* for the relation. Works for tables, views, foreign tables and partitioned
* tables.
*/
extern const TupleTableSlotOps *table_slot_callbacks(Relation rel);
/*
- * Returns slot using the callbacks returned by table_slot_callbacks(), and
- * registers it on *reglist.
+ * Return a slot using the callbacks returned by table_slot_callbacks(), and
+ * register it on *reglist.
*/
extern TupleTableSlot *table_slot_create(Relation rel, List **reglist);
@@ -676,8 +728,8 @@ extern TupleTableSlot *table_slot_create(Relation rel, List **reglist);
*/
/*
- * Start a scan of `rel`. Returned tuples pass a visibility test of
- * `snapshot`, and if nkeys != 0, the results are filtered by those scan keys.
+ * Start a scan of `rel`. Returned tuples are visible according to `snapshot`,
+ * and if nkeys != 0, the results are filtered by those scan keys.
*/
static inline TableScanDesc
table_beginscan(Relation rel, Snapshot snapshot,
@@ -688,18 +740,24 @@ table_beginscan(Relation rel, Snapshot snapshot,
}
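For illustration, a bare-bones sequential scan through these wrappers looks
something like this (sketch only; 'rel' is open and 'snapshot' is a registered
snapshot):

    TableScanDesc   scan = table_beginscan(rel, snapshot, 0, NULL);
    TupleTableSlot *slot = table_slot_create(rel, NULL);

    while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
    {
        /* process the tuple stored in 'slot' */
    }

    table_endscan(scan);
    ExecDropSingleTupleTableSlot(slot);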
/*
- * Like table_beginscan(), but for scanning catalog. It'll automatically use a
- * snapshot appropriate for scanning catalog relations.
+ * Like table_beginscan(), but for scanning a system catalog. It will
+ * automatically use a snapshot appropriate for scanning catalog relations.
*/
extern TableScanDesc table_beginscan_catalog(Relation rel, int nkeys,
struct ScanKeyData *key);
/*
* Like table_beginscan(), but table_beginscan_strat() offers an extended API
- * that lets the caller control whether a nondefault buffer access strategy
- * can be used, and whether syncscan can be chosen (possibly resulting in the
- * scan not starting from block zero). Both of these default to true with
- * plain table_beginscan.
+ * that lets the caller use a non-default buffer access strategy, or
+ * specify that a synchronized scan can be used (possibly resulting in the
+ * scan not starting from block zero). Both of these default to true, as
+ * with plain table_beginscan.
+ *
+ * HEIKKI: I'm a bit confused by 'allow_strat'. What is the non-default
+ * strategy that will get used if you pass allow_strat=true? Perhaps the flag
+ * should be called "use_bulkread_strategy"? Or it should be of type
+ * BufferAccessStrategyType, or the caller should create a strategy with
+ * GetAccessStrategy() and pass that.
*/
static inline TableScanDesc
table_beginscan_strat(Relation rel, Snapshot snapshot,
@@ -712,10 +770,10 @@ table_beginscan_strat(Relation rel, Snapshot snapshot,
}
/*
- * table_beginscan_bm is an alternative entry point for setting up a
- * TableScanDesc for a bitmap heap scan. Although that scan technology is
- * really quite unlike a standard seqscan, there is just enough commonality to
- * make it worth using the same data structure.
+ * table_beginscan_bm() is an alternative entry point for setting up a
+ * TableScanDesc for a bitmap heap scan. Although a bitmap scan is
+ * really quite unlike a standard seqscan, there is just enough commonality
+ * that it makes sense to use a TableScanDesc for both.
*/
static inline TableScanDesc
table_beginscan_bm(Relation rel, Snapshot snapshot,
@@ -726,11 +784,13 @@ table_beginscan_bm(Relation rel, Snapshot snapshot,
}
/*
- * table_beginscan_sampling is an alternative entry point for setting up a
+ * table_beginscan_sampling() is an alternative entry point for setting up a
* TableScanDesc for a TABLESAMPLE scan. As with bitmap scans, it's worth
* using the same data structure although the behavior is rather different.
* In addition to the options offered by table_beginscan_strat, this call
* also allows control of whether page-mode visibility checking is used.
+ *
+ * HEIKKI: What is 'pagemode'?
*/
static inline TableScanDesc
table_beginscan_sampling(Relation rel, Snapshot snapshot,
@@ -973,9 +1033,6 @@ table_get_latest_tid(Relation rel, Snapshot snapshot, ItemPointer tid)
*
* This assumes the slot's tuple is valid, and of the appropriate type for the
* AM.
- *
- * Some AMs might modify the data underlying the tuple as a side-effect. If so
- * they ought to mark the relevant buffer dirty.
*/
static inline bool
table_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
@@ -1004,31 +1061,33 @@ table_compute_xid_horizon_for_tuples(Relation rel,
*/
/*
- * Insert a tuple from a slot into table AM routine.
+ * Insert a tuple from a slot into the table.
*
- * The options bitmask allows to specify options that allow to change the
- * behaviour of the AM. Several options might be ignored by AMs not supporting
- * them.
+ * The options bitmask allows changing the behaviour of the AM. The AM may
+ * ignore options that it does not support.
*
* If the TABLE_INSERT_SKIP_WAL option is specified, the new tuple doesn't
* need to be logged to WAL, even for a non-temp relation. It is the AMs
* choice whether this optimization is supported.
*
- * If the TABLE_INSERT_SKIP_FSM option is specified, AMs are free to not reuse
- * free space in the relation. This can save some cycles when we know the
+ * The TABLE_INSERT_SKIP_FSM option is a hint that the AM should not bother
+ * to try reusing free space. This can save some cycles when we know the
* relation is new and doesn't contain useful amounts of free space. It's
* commonly passed directly to RelationGetBufferForTuple, see for more info.
*
- * TABLE_INSERT_FROZEN should only be specified for inserts into
- * relfilenodes created during the current subtransaction and when
- * there are no prior snapshots or pre-existing portals open.
- * This causes rows to be frozen, which is an MVCC violation and
- * requires explicit options chosen by user.
+ * TABLE_INSERT_FROZEN means that the AM may insert the tuples as already
+ * frozen, so that they become immediately visible to everyone. That violates
+ * the normal MVCC rules, and therefore requires an explicit request from the
+ * user; currently it is only used for the COPY FREEZE option. Even then, it
+ * should only be specified for inserts into relfilenodes created during the
+ * current subtransaction and when there are no prior snapshots or
+ * pre-existing portals open.
*
* TABLE_INSERT_NO_LOGICAL force-disables the emitting of logical decoding
* information for the tuple. This should solely be used during table rewrites
* where RelationIsLogicallyLogged(relation) is not yet accurate for the new
* relation.
+ * HEIKKI: Is this optional, too? Can the AM ignore it?
*
* Note that most of these options will be applied when inserting into the
* heap's TOAST table, too, if the tuple requires any out-of-line data.
@@ -1041,6 +1100,8 @@ table_compute_xid_horizon_for_tuples(Relation rel,
* On return the slot's tts_tid and tts_tableOid are updated to reflect the
* insertion. But note that any toasting of fields within the slot is NOT
* reflected in the slots contents.
+ *
+ * HEIKKI: I think GetBulkInsertState() should be an AM-specific callback.
*/
static inline void
table_insert(Relation rel, TupleTableSlot *slot, CommandId cid,
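For example, a bulk load in the style of COPY would use this roughly as follows
(sketch; GetBulkInsertState()/FreeBulkInsertState() are the heap-specific
functions mentioned in the note above):

    BulkInsertState bistate = GetBulkInsertState();
    int         options = TABLE_INSERT_SKIP_FSM;    /* e.g. a brand-new relfilenode */

    table_insert(rel, slot, GetCurrentCommandId(true), options, bistate);

    table_finish_bulk_insert(rel, options);
    FreeBulkInsertState(bistate);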
@@ -1089,9 +1150,8 @@ table_complete_speculative(Relation rel, TupleTableSlot *slot,
* operation. That's often faster than calling table_insert() in a loop,
* because e.g. the AM can reduce WAL logging and page locking overhead.
*
- * Except for taking `nslots` tuples as input, as an array of TupleTableSlots
- * in `slots`, the parameters for table_multi_insert() are the same as for
- * table_insert().
+ * The parameters are the same as for table_insert(), except for taking
+ * an array of slots.
*
* Note: this leaks memory into the current memory context. You can create a
* temporary context before calling this, if that's a problem.
@@ -1115,20 +1175,25 @@ table_multi_insert(Relation rel, TupleTableSlot **slots, int nslots,
* tid - TID of tuple to be deleted
* cid - delete command ID (used for visibility test, and stored into
* cmax if successful)
+ * HEIKKI: description for 'snapshot' parameter is missing
* crosscheck - if not InvalidSnapshot, also check tuple against this
* wait - true if should wait for any conflicting update to commit/abort
- * Output parameters:
- * tmfd - filled in failure cases (see below)
* changingPart - true iff the tuple is being moved to another partition
* table due to an update of the partition key. Otherwise, false.
+ * Output parameters:
+ * tmfd - filled in failure cases (see below)
+ *
+ * HEIKKI: What's the AM supposed to do differently if 'changingPart' is set?
+ * (I know, it's supposed to set the t_ctid to the magic "moved partition"
+ * value. Explain that)
*
* Normal, successful return value is TM_Ok, which means we did actually
- * delete it. Failure return codes are TM_SelfModified, TM_Updated, and
- * TM_BeingModified (the last only possible if wait == false).
+ * delete it. Failure return codes are TM_SelfModified, TM_Updated,
+ * TM_Deleted, and TM_BeingModified (the last only possible if wait == false).
*
* In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
- * t_xmax, and, if possible, and, if possible, t_cmax. See comments for
- * struct TM_FailureData for additional info.
+ * t_xmax, and, if possible, t_cmax. See comments for struct TM_FailureData
+ * for additional info.
*/
static inline TM_Result
table_delete(Relation rel, ItemPointer tid, CommandId cid,
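To make the failure codes concrete, a caller typically handles the result along
these lines (a rough sketch of what ExecDelete() does; 'tid', 'cid' and
'snapshot' come from the executor, and most details are omitted):

    TM_FailureData  tmfd;
    TM_Result       result;

    result = table_delete(rel, &tid, cid, snapshot,
                          InvalidSnapshot,  /* crosscheck */
                          true,             /* wait */
                          &tmfd,
                          false);           /* changingPart */
    switch (result)
    {
        case TM_Ok:
            break;          /* deleted */
        case TM_SelfModified:
            /* outdated by this transaction; tmfd.cmax says by which command */
            break;
        case TM_Updated:
        case TM_Deleted:
            /* concurrent update/delete; tmfd.ctid and tmfd.xmax describe it */
            break;
        case TM_BeingModified:
            /* only possible when wait == false */
            break;
        default:
            break;
    }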
@@ -1161,8 +1226,8 @@ table_delete(Relation rel, ItemPointer tid, CommandId cid,
* are required for this tuple
*
* Normal, successful return value is TM_Ok, which means we did actually
- * update it. Failure return codes are TM_SelfModified, TM_Updated, and
- * TM_BeingModified (the last only possible if wait == false).
+ * update it. Failure return codes are TM_SelfModified, TM_Updated,
+ * TM_Deleted, and TM_BeingModified (the last only possible if wait == false).
*
* On success, the slot's tts_tid and tts_tableOid are updated to match the new
* stored tuple; in particular, slot->tts_tid is set to the TID where the
@@ -1170,6 +1235,9 @@ table_delete(Relation rel, ItemPointer tid, CommandId cid,
* update was done. However, any TOAST changes in the new tuple's
* data are not reflected into *newtup.
*
+ * HEIKKI: There is no 'newtup'.
+ * HEIKKI: HEAP_ONLY_TUPLE is AM-specific; do the callers peek into that, currently?
+ *
* In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
* t_xmax, and, if possible, t_cmax. See comments for struct TM_FailureData
* for additional info.
@@ -1233,8 +1301,8 @@ table_lock_tuple(Relation rel, ItemPointer tid, Snapshot snapshot,
/*
* Perform operations necessary to complete insertions made via
* tuple_insert and multi_insert with a BulkInsertState specified. This
- * e.g. may e.g. used to flush the relation when inserting with
- * TABLE_INSERT_SKIP_WAL specified.
+ * may e.g. be used to flush the relation when the TABLE_INSERT_SKIP_WAL
+ * option was used.
*/
static inline void
table_finish_bulk_insert(Relation rel, int options)
@@ -1257,8 +1325,8 @@ table_finish_bulk_insert(Relation rel, int options)
* This is used both during relation creation and various DDL operations to
* create a new relfilenode that can be filled from scratch.
*
- * *freezeXid, *minmulti are set to the xid / multixact horizon for the table
- * that pg_class.{relfrozenxid, relminmxid} have to be set to.
+ * The function sets *freezeXid, *minmulti to the xid / multixact horizon
+ * values that the table's pg_class.{relfrozenxid, relminmxid} have to be set to.
*/
static inline void
table_relation_set_new_filenode(Relation rel, char persistence,
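From the AM's side, a minimal implementation of the underlying callback could
look like this (hypothetical AM that has no use for xid-based visibility;
signature as in this header, other details omitted):

    static void
    myam_relation_set_new_filenode(Relation rel, char persistence,
                                   TransactionId *freezeXid,
                                   MultiXactId *minmulti)
    {
        /* this AM doesn't need relfrozenxid/relminmxid tracking */
        *freezeXid = InvalidTransactionId;
        *minmulti = InvalidMultiXactId;

        RelationCreateStorage(rel->rd_node, persistence);
    }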
@@ -1271,7 +1339,7 @@ table_relation_set_new_filenode(Relation rel, char persistence,
/*
* Remove all table contents from `rel`, in a non-transactional manner.
- * Non-transactional meaning that there's no need to support rollbacks. This
+ * Non-transactional, meaning that this cannot be rolled back. This
* commonly only is used to perform truncations for relfilenodes created in the
* current transaction.
*/
@@ -1294,20 +1362,19 @@ table_relation_copy_data(Relation rel, RelFileNode newrnode)
}
/*
- * Copy data from `OldHeap` into `NewHeap`, as part of a CLUSTER or VACUUM
- * FULL.
+ * Copy all data from `OldHeap` into `NewHeap`, as part of a CLUSTER or
+ * VACUUM FULL.
*
- * If `use_sort` is true, the table contents are sorted appropriate for
- * `OldIndex`; if use_sort is false and OldIndex is not InvalidOid, the data
- * is copied in that index's order; if use_sort is false and OidIndex is
- * InvalidOid, no sorting is performed.
+ * If `OldIndex` is valid, the data should be ordered according to the
+ * given index. If `use_sort` is false, the data should be fetched from the
+ * index, otherwise it should be fetched from the old table and sorted.
*
* OldestXmin, FreezeXid, MultiXactCutoff must be currently valid values for
* the table.
*
- * *num_tuples, *tups_vacuumed, *tups_recently_dead will contain statistics
- * computed while copying for the relation. Not all might make sense for every
- * AM.
+ * The function sets *num_tuples, *tups_vacuumed, *tups_recently_dead to
+ * statistics computed while copying the relation's data. Not all of them
+ * might make sense for every AM.
*/
static inline void
table_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
@@ -1369,7 +1436,7 @@ table_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
* is stored in `slot`.
*
* *liverows and *deadrows are incremented according to the encountered
- * tuples.
+ * tuples, if the AM has the concept of live and dead tuples.
*/
static inline bool
table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
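For context, acquire_sample_rows() drives these two wrappers roughly like this
(simplified sketch; the choice of 'blockno', 'bstrategy' and 'OldestXmin', and
the row sampling itself, are omitted):

    TableScanDesc   scan = table_beginscan_analyze(rel);
    TupleTableSlot *slot = table_slot_create(rel, NULL);
    double          liverows = 0;
    double          deadrows = 0;

    /* for each block chosen by the sampler ... */
    if (table_scan_analyze_next_block(scan, blockno, bstrategy))
    {
        while (table_scan_analyze_next_tuple(scan, OldestXmin,
                                             &liverows, &deadrows, slot))
        {
            /* feed the tuple in 'slot' to the row sampler */
        }
    }

    ExecDropSingleTupleTableSlot(slot);
    table_endscan(scan);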
@@ -1384,34 +1451,36 @@ table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
/*
* table_index_build_scan - scan the table to find tuples to be indexed
*
- * This is called back from an access-method-specific index build procedure
- * after the AM has done whatever setup it needs. The parent heap relation
- * is scanned to find tuples that should be entered into the index. Each
- * such tuple is passed to the AM's callback routine, which does the right
- * things to add it to the new index. After we return, the AM's index
- * build procedure does whatever cleanup it needs.
+ * This is called by the index-AM-specific index build procedure, after the
+ * index AM has done whatever setup it needs. This function scans the parent
+ * table, to find tuples that should be entered into the index. Each such tuple
+ * is passed to the index AM's callback routine, which does the right things to
+ * add it to the new index. After this returns, the index AM's index build
+ * procedure does whatever cleanup it needs.
*
* The total count of live tuples is returned. This is for updating pg_class
* statistics. (It's annoying not to be able to do that here, but we want to
* merge that update with others; see index_update_stats.) Note that the
* index AM itself must keep track of the number of index tuples; we don't do
- * so here because the AM might reject some of the tuples for its own reasons,
+ * so here because the index AM might reject some of the tuples for its own reasons,
* such as being unable to store NULLs.
*
- * If 'progress', the PROGRESS_SCAN_BLOCKS_TOTAL counter is updated when
+ * If 'progress' is true, the PROGRESS_SCAN_BLOCKS_TOTAL counter is updated when
* starting the scan, and PROGRESS_SCAN_BLOCKS_DONE is updated as we go along.
*
- * A side effect is to set indexInfo->ii_BrokenHotChain to true if we detect
+ * A side effect is to set index_info->ii_BrokenHotChain to true if we detect
* any potentially broken HOT chains. Currently, we set this if there are any
* RECENTLY_DEAD or DELETE_IN_PROGRESS entries in a HOT chain, without trying
* very hard to detect whether they're really incompatible with the chain tip.
* This only really makes sense for heap AM, it might need to be generalized
* for other AMs later.
+ *
+ * HEIKKI: What does 'allow_sync' do?
*/
static inline double
table_index_build_scan(Relation heap_rel,
Relation index_rel,
- struct IndexInfo *index_nfo,
+ struct IndexInfo *index_info,
bool allow_sync,
bool progress,
IndexBuildCallback callback,
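To make the callback relationship concrete: the index AM passes a routine with
the IndexBuildCallback signature (the typedef near the top of this header), and
the scan hands every tuple that should be indexed to it. A skeleton with
hypothetical names, modeled loosely on nbtree's build callback:

    static void
    myindex_build_callback(Relation index, HeapTuple htup, Datum *values,
                           bool *isnull, bool tupleIsAlive, void *state)
    {
        /* 'state' is the callback_state passed to table_index_build_scan() */
        /* form an index tuple from values/isnull and insert it into 'index' */
    }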
@@ -1420,7 +1489,7 @@ table_index_build_scan(Relation heap_rel,
{
return heap_rel->rd_tableam->index_build_range_scan(heap_rel,
index_rel,
- index_nfo,
+ index_info,
allow_sync,
false,
progress,
@@ -1432,12 +1501,12 @@ table_index_build_scan(Relation heap_rel,
}
/*
- * As table_index_build_scan(), except that instead of scanning the complete
- * table, only the given number of blocks are scanned. Scan to end-of-rel can
+ * As table_index_build_scan(), but instead of scanning the complete
+ * table, only the given block range is scanned. Scan to end-of-rel can
* be signalled by passing InvalidBlockNumber as numblocks. Note that
* restricting the range to scan cannot be done when requesting syncscan.
*
- * When "anyvisible" mode is requested, all tuples visible to any transaction
+ * When 'anyvisible' mode is requested, all tuples visible to any transaction
* are indexed and counted as live, including those inserted or deleted by
* transactions that are still in progress.
*/