From c98ece2fa6ca247a52c7f76d2d1999cc1683f34a Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@enterprisedb.com>
Date: Sat, 28 Jul 2018 01:25:12 +1200
Subject: [PATCH] Cache file sizes to avoid lseek() calls.

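Add a counter in shared memory that is incremented whenever any relation
is extended or truncated.  md.c caches each segment's block count together
with the counter value it saw, and reuses the cached count while the counter
is unchanged, so databases that reach a steady size can skip the lseek()
calls.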

XXX WIP -- highly experimental, there may be much better ideas
than this, and the memory synchronisation may not be strong enough!
Also smgrinit2 isn't a good name, obv.
---
 src/backend/storage/smgr/md.c     | 45 ++++++++++++++++++++++++++++++-
 src/backend/storage/smgr/smgr.c   | 15 +++++++++++
 src/backend/utils/init/postinit.c |  2 ++
 src/include/storage/smgr.h        |  2 ++
 4 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 86013a5c8b..38f659afd0 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -110,6 +110,8 @@ typedef struct _MdfdVec
 {
 	File		mdfd_vfd;		/* fd number in fd.c's pool */
 	BlockNumber mdfd_segno;		/* segment number, from 0 */
+	BlockNumber	nblocks;	/* cached version of number of blocks */
+	uint32		relsize_change_counter;	/* used for invalidation */
 } MdfdVec;
 
 static MemoryContext MdCxt;		/* context for all MdfdVec objects */
@@ -198,6 +200,13 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
 		   MdfdVec *seg);
 
+typedef struct MdSharedData
+{
+	/* XXX could have an array of these, and use rel OID % nelements? */
+	pg_atomic_uint32	relsize_change_counter;	/* bumped by mdextend/mdtruncate */
+} MdSharedData;
+
+static MdSharedData *MdShared;	/* set by mdinit2(); NULL before that */
 
 /*
  *	mdinit() -- Initialize private state for magnetic disk storage manager.
@@ -244,6 +253,16 @@ mdinit(void)
 	}
 }
 
+void
+mdinit2(void)					/* point MdShared at md.c's shared-memory state */
+{
+	bool		found;			/* out-parameter of ShmemInitStruct */
+
+	MdShared = ShmemInitStruct("MdShared", sizeof(MdSharedData), &found);
+	if (!found)					/* presumably first creation: start counter at 0 */
+		pg_atomic_init_u32(&MdShared->relsize_change_counter, 0);
+}
+
 /*
  * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
  * already created the pendingOpsTable during initialization of the startup
@@ -538,6 +557,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 						nbytes, BLCKSZ, blocknum),
 				 errhint("Check free disk space.")));
 	}
+	pg_atomic_fetch_add_u32(&MdShared->relsize_change_counter, 1);
 
 	if (!skipFsync && !SmgrIsTemp(reln))
 		register_dirty_segment(reln, forknum, v);
@@ -600,6 +620,8 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
 	mdfd = &reln->md_seg_fds[forknum][0];
 	mdfd->mdfd_vfd = fd;
 	mdfd->mdfd_segno = 0;
+	mdfd->nblocks = InvalidBlockNumber;
+	mdfd->relsize_change_counter = 0;
 
 	Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
 
@@ -986,6 +1008,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 		}
 		curopensegs--;
 	}
+	pg_atomic_fetch_add_u32(&MdShared->relsize_change_counter, 1);
 }
 
 /*
@@ -1950,8 +1973,24 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 static BlockNumber
 _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 {
+	uint32		relsize_change_counter = 0;	/* stays 0 if MdShared is unset */
+	BlockNumber	result;
 	off_t		len;
 
+	if (MdShared)
+	{
+		/*
+		 * Fast path: trust the cached size if the shared counter is unchanged.
+		 * No barrier here: acquiring a snapshot or a relation extension lock
+		 * already executed one, so the counter we read should be fresh enough
+		 * for our purposes.  XXX Right?  Hmm.
+		 */
+		relsize_change_counter = pg_atomic_read_u32(&MdShared->relsize_change_counter);
+		if (seg->nblocks != InvalidBlockNumber &&
+			seg->relsize_change_counter == relsize_change_counter)
+			return seg->nblocks;
+	}
+
 	len = FileSize(seg->mdfd_vfd);
 	if (len < 0)
 		ereport(ERROR,
@@ -1959,5 +1998,9 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 				 errmsg("could not seek to end of file \"%s\": %m",
 						FilePathName(seg->mdfd_vfd))));
 	/* note that this calculation will ignore any partial block at EOF */
-	return (BlockNumber) (len / BLCKSZ);
+	result = len / BLCKSZ;
+	seg->nblocks = result;		/* remember the size for the fast path above */
+	seg->relsize_change_counter = relsize_change_counter;	/* value read before FileSize */
+
+	return result;
 }
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 189342ef86..06cd70cf03 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -38,6 +38,7 @@
 typedef struct f_smgr
 {
 	void		(*smgr_init) (void);	/* may be NULL */
+	void		(*smgr_init2) (void);	/* may be NULL */
 	void		(*smgr_shutdown) (void);	/* may be NULL */
 	void		(*smgr_close) (SMgrRelation reln, ForkNumber forknum);
 	void		(*smgr_create) (SMgrRelation reln, ForkNumber forknum,
@@ -69,6 +70,7 @@ static const f_smgr smgrsw[] = {
 	/* magnetic disk */
 	{
 		.smgr_init = mdinit,
+		.smgr_init2 = mdinit2,
 		.smgr_shutdown = NULL,
 		.smgr_close = mdclose,
 		.smgr_create = mdcreate,
@@ -128,6 +130,19 @@ smgrinit(void)
 	on_proc_exit(smgrshutdown, 0);
 }
 
+/* Run after shared memory is initialized; calls each non-NULL smgr_init2. */
+void
+smgrinit2(void)
+{
+	int			i;
+
+	for (i = 0; i < NSmgr; i++)
+	{
+		if (smgrsw[i].smgr_init2)	/* hook is optional, may be NULL */
+			smgrsw[i].smgr_init2();
+	}
+}
+
 /*
  * on_proc_exit hook for smgr cleanup during backend shutdown
  */
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 4f1d2a0d28..3813c64ddb 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -594,6 +594,8 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
 
 	elog(DEBUG3, "InitPostgres");
 
+	smgrinit2();
+
 	/*
 	 * Add my PGPROC struct to the ProcArray.
 	 *
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index c843bbc969..45e05955b5 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -81,6 +81,7 @@ typedef SMgrRelationData *SMgrRelation;
 	RelFileNodeBackendIsTemp((smgr)->smgr_rnode)
 
 extern void smgrinit(void);
+extern void smgrinit2(void);
 extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend);
 extern bool smgrexists(SMgrRelation reln, ForkNumber forknum);
 extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln);
@@ -116,6 +117,7 @@ extern void AtEOXact_SMgr(void);
 
 /* in md.c */
 extern void mdinit(void);
+extern void mdinit2(void);
 extern void mdclose(SMgrRelation reln, ForkNumber forknum);
 extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
-- 
2.19.1

