This is my latest revision of the Sync Scan patch, and it implements the
observability as discussed with Simon.

Changes:
 * ss_store_hint() (which reports the scan location) is called once per
hundred pages rather than once per page
 * DEBUG messages are a little cleaner and easier to parse, for the sake
of analysis after the fact.
 * DEBUG2 reports a sync scan starting, the relation size in pages, and
the location at which the scan starts.
 * DEBUG2 reports the location of a scan every 50k pages, DEBUG3 every
5k pages (before it was 100k/10k at DEBUG3/DEBUG4, respectively).
Numbers are aligned along 5k boundaries to make analysis easier.
 * GUCs:
   * sync_seqscan_threshold: fraction of NBuffers for the threshold
   * sync_seqscan_offset: fraction of NBuffers for the offset
   * trace_sync_seqscan: will be used in final version of patch to
control DEBUG output

sync_seqscan_offset may be eliminated completely if it's not shown to be
useful enough in conjunction with Simon's patch. Sync Scans are still a
big win without sync_seqscan_offset.

sync_seqscan_threshold=<real> may be turned into sync_seqscan=<boolean>
with a fixed activation threshold (NBuffers/2 per Simon's suggestion).
The reason is that synchronized scans should activate at the same
threshold as Simon's scan_recycle_buffers feature. Should we make a
"#define BIG_SCAN_THRESHOLD NBuffers/2" to use for both sync_seqscan and
for scan_recycle_buffers?

Regards,
        Jeff Davis
diff -cr postgresql-8.2.3/src/backend/access/heap/heapam.c postgresql-8.2.3-syncscan/src/backend/access/heap/heapam.c
*** postgresql-8.2.3/src/backend/access/heap/heapam.c	2007-02-04 12:00:49.000000000 -0800
--- postgresql-8.2.3-syncscan/src/backend/access/heap/heapam.c	2007-03-13 23:21:27.000000000 -0700
***************
*** 65,70 ****
--- 65,279 ----
   * ----------------------------------------------------------------
   */
  
+ static BlockNumber ss_init(HeapScanDesc);	/* picks rs_start_page for a scan */
+ static int         ss_store_hint(HeapScanDesc,BlockNumber);	/* records scan position */
+ static int         ss_hash(HeapScanDesc);	/* relation -> hint table slot */
+ bool Trace_sync_seqscan = false;	/* GUC trace_sync_seqscan; not consulted by the ss_* functions here */
+ double sync_seqscan_threshold = DEFAULT_SYNC_SCAN_THRESHOLD;	/* GUC, fraction of NBuffers */
+ double sync_seqscan_offset = DEFAULT_SYNC_SCAN_OFFSET;	/* GUC, fraction of NBuffers */
+ 
+ /*
+  * ss_init:
+  *
+  * Chooses the page at which a sequential scan of scan->rs_rd starts.
+  *
+  * The Sync Scan Hint Table in shared memory (created here on first
+  * use) records where a scan of each relation was last reported to
+  * be. By starting a new scan near the location of an already
+  * running scan, we improve the chance of finding pages in cache.
+  *
+  * sync_seqscan_offset * NBuffers pages are subtracted from the
+  * hinted location, wrapping around the end of the relation, in
+  * order to pick up pages that are likely to already be in cache.
+  *
+  * This function assumes that scan->rs_nblocks is already properly
+  * set. It sets scan->rs_start_page to a value based on the hint
+  * found (0 if there is no usable hint), and sets scan->rs_hint to
+  * point to this relation's slot in the hint table, or to NULL if
+  * the relation is below the size threshold and no slot is used.
+  *
+  * The return value is always 0; the start page is passed back in
+  * scan->rs_start_page.
+  */
+ static BlockNumber ss_init(HeapScanDesc scan)
+ {
+ 	ss_hint_t  *hint_table;
+ 	ss_hint_t	hint;
+ 	bool		found;
+ 	int			threshold = sync_seqscan_threshold * NBuffers;
+ 	int			offset = sync_seqscan_offset * NBuffers;
+ 
+ 	/* Defaults until a usable hint is found: page 0, no hint slot. */
+ 	scan->rs_start_page = 0;
+ 	scan->rs_hint = NULL;
+ 
+ 	/*
+ 	 * If the table is not large compared to shared_buffers
+ 	 * (sync_seqscan_threshold * NBuffers pages), don't Sync Scan.
+ 	 */
+ 	if (scan->rs_nblocks < threshold)
+ 	{
+ 		elog(DEBUG2,"SYNC_SCAN: Table too small to sync scan");
+ 		return 0;
+ 	}
+ 
+ 	hint_table = (ss_hint_t *) ShmemInitStruct("Sync Scan Hint Table",
+ 		SYNC_SCAN_TABLE_SIZE * sizeof(ss_hint_t), &found);
+ 
+ 	scan->rs_hint = &hint_table[ss_hash(scan)];
+ 
+ 	/*
+ 	 * If we just created the hint table for the first time,
+ 	 * initialize the table to zero and start the scan at page 0.
+ 	 */
+ 	if (!found)
+ 	{
+ 		elog(DEBUG2,"SYNC_SCAN: Created Hint Table");
+ 		memset(hint_table, 0, sizeof(ss_hint_t) * SYNC_SCAN_TABLE_SIZE);
+ 		return 0;
+ 	}
+ 
+ 	/*
+ 	 * The table is read and written without locking, so other
+ 	 * backends may overwrite this slot at any moment. Take one
+ 	 * private copy and both validate and use that copy; checking
+ 	 * the shared entry and then reading it again would let an
+ 	 * unchecked block number through.
+ 	 */
+ 	hint = *scan->rs_hint;
+ 
+ 	/*
+ 	 * If the hint's relid is 0, that means we have not previously
+ 	 * created a hint at this location in the table.
+ 	 */
+ 	if (hint.relid == 0)
+ 	{
+ 		elog(DEBUG2, "SYNC_SCAN: Hint empty");
+ 		return 0;
+ 	}
+ 
+ 	/*
+ 	 * A relid other than ours in the slot means a hash collision.
+ 	 */
+ 	if (RelationGetRelid(scan->rs_rd) != hint.relid)
+ 	{
+ 		elog(DEBUG1,"SYNC_SCAN: Hash collision");
+ 		return 0;
+ 	}
+ 
+ 	/*
+ 	 * If the hint is not a valid block number for this relation,
+ 	 * start at 0. This can happen if, for instance, someone
+ 	 * TRUNCATEd the table between when the hint was set and now.
+ 	 * (BlockNumber is unsigned, so only the upper bound is checked.)
+ 	 */
+ 	if (hint.location >= scan->rs_nblocks)
+ 	{
+ 		elog(DEBUG2,"SYNC_SCAN: Hint %u out of range." \
+ 				" Relation has %u pages.",
+ 			hint.location, scan->rs_nblocks);
+ 		return 0;
+ 	}
+ 
+ 	scan->rs_start_page = hint.location;
+ 
+ 	/*
+ 	 * By starting offset pages earlier than the hint, it's likely
+ 	 * that all of the blocks will already be cached, and the scan
+ 	 * will quickly catch up to the head.
+ 	 */
+ 	if (offset > scan->rs_nblocks)
+ 	{
+ 		elog(DEBUG2,"SYNC_SCAN: Relation smaller than start offset: %d",
+ 			offset);
+ 		return 0;
+ 	}
+ 
+ 	/*
+ 	 * If subtracting the offset would bring the value to less than 0,
+ 	 * we circle backwards to the end of the file.
+ 	 */
+ 	if (offset > scan->rs_start_page)
+ 		scan->rs_start_page += scan->rs_nblocks;
+ 
+ 	scan->rs_start_page -= offset;
+ 
+ 	elog(DEBUG2,"SYNC_SCAN: START: OID = %u; Location = %u; Size: %u",
+ 		RelationGetRelid(scan->rs_rd),
+ 		scan->rs_start_page, scan->rs_nblocks);
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * ss_store_hint:
+  *
+  * Writes an entry in the Sync Scan Hint Table of the form
+  * (relid,blocknumber), overwriting any existing entry that
+  * collides with this one. Nothing is written when the scan has
+  * no hint slot (scan->rs_hint is NULL), when the relation is
+  * below the size threshold, or when the scan has advanced fewer
+  * than sync_seqscan_offset * NBuffers pages from its start.
+  *
+  * No locking is performed here; ss_init() range-checks what it
+  * reads back. Always returns 0.
+  */
+ static int ss_store_hint(HeapScanDesc scan, BlockNumber location)
+ {
+ 	ss_hint_t	hint;
+ 	BlockNumber	progress;
+ 	int			threshold = sync_seqscan_threshold * NBuffers;
+ 	int			offset = sync_seqscan_offset * NBuffers;
+ 
+ 	/*
+ 	 * Print every 50k pages to DEBUG2
+ 	 * and every 5k pages to DEBUG3.
+ 	 */
+ 	if (!(location % 50000))
+ 		elog(DEBUG2,"page: %u",location);
+ 	else if (!(location % 5000))
+ 		elog(DEBUG3,"page: %u",location);
+ 
+ 	/*
+ 	 * No hint slot was attached by ss_init(). The GUC-derived
+ 	 * threshold below may have changed since, so test the pointer.
+ 	 */
+ 	if (scan->rs_hint == NULL)
+ 		return 0;
+ 
+ 	/* If the table is too small, don't bother with Sync Scan. */
+ 	if (scan->rs_nblocks < threshold)
+ 		return 0;
+ 
+ 	/*
+ 	 * If this scan has been progressing for less than offset pages
+ 	 * (counting across a wrap past the end), don't store the hint.
+ 	 */
+ 	if (location >= scan->rs_start_page)
+ 		progress = location - scan->rs_start_page;
+ 	else
+ 		progress = location + scan->rs_nblocks - scan->rs_start_page;
+ 	if (progress < offset)
+ 		return 0;
+ 
+ 	hint.relid = RelationGetRelid(scan->rs_rd);
+ 	hint.location = location;
+ 
+ 	*scan->rs_hint = hint;
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * ss_hash: hint-table slot for a scan, the relation Oid modulo table size.
+  */
+ static int ss_hash(HeapScanDesc scan)
+ {
+ 	Oid			relid = RelationGetRelid(scan->rs_rd);
+ 
+ 	return relid % SYNC_SCAN_TABLE_SIZE;
+ }
+ 
  /* ----------------
   *		initscan - scan code common to heap_beginscan and heap_rescan
   * ----------------
***************
*** 81,86 ****
--- 290,300 ----
  	 */
  	scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
  
+ 	/*
+ 	 * Choose a good place to start the relation scan.
+ 	 */
+ 	ss_init(scan);
+ 
  	scan->rs_inited = false;
  	scan->rs_ctup.t_data = NULL;
  	ItemPointerSetInvalid(&scan->rs_ctup.t_self);
***************
*** 223,229 ****
  				tuple->t_data = NULL;
  				return;
  			}
! 			page = 0;			/* first page */
  			heapgetpage(scan, page);
  			lineoff = FirstOffsetNumber;		/* first offnum */
  			scan->rs_inited = true;
--- 437,447 ----
  				tuple->t_data = NULL;
  				return;
  			}
! 			/*
! 			 * start the scan at the location that we chose
! 			 * in ss_init()
! 			 */
! 			page = scan->rs_start_page;
  			heapgetpage(scan, page);
  			lineoff = FirstOffsetNumber;		/* first offnum */
  			scan->rs_inited = true;
***************
*** 364,378 ****
  		}
  
  		/*
! 		 * if we get here, it means we've exhausted the items on this page and
  		 * it's time to move to the next.
  		 */
  		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
  
  		/*
! 		 * return NULL if we've exhausted all the pages
  		 */
! 		if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks))
  		{
  			if (BufferIsValid(scan->rs_cbuf))
  				ReleaseBuffer(scan->rs_cbuf);
--- 582,611 ----
  		}
  
  		/*
! 		 * If we get here, it means we've exhausted the items on this page and
  		 * it's time to move to the next.
+ 		 *
+ 		 * For the forward scan, we need to wrap around to the beginning
+ 		 * of the relation file if we reach the end.
  		 */
  		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
  
+ 		if(backward)
+ 			page--;
+ 		else
+ 			page = (page + 1) % (scan->rs_nblocks);
+ 
+ 		if(! (page % SYNC_SCAN_REPORT_INTERVAL) )
+ 			ss_store_hint(scan,page);
+ 
  		/*
! 		 * Return NULL if we've exhausted all the pages.
! 		 * For reverse scans, that means we've reached 0. For 
! 		 * forward scans, that means we've reached the page on
! 		 * which we started.
  		 */
! 		if ((backward && (page == 0)) ||
! 			((page%(scan->rs_nblocks)) == scan->rs_start_page))
  		{
  			if (BufferIsValid(scan->rs_cbuf))
  				ReleaseBuffer(scan->rs_cbuf);
***************
*** 383,390 ****
  			return;
  		}
  
- 		page = backward ? (page - 1) : (page + 1);
- 
  		heapgetpage(scan, page);
  
  		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
--- 616,621 ----
***************
*** 450,456 ****
  				tuple->t_data = NULL;
  				return;
  			}
! 			page = 0;			/* first page */
  			heapgetpage(scan, page);
  			lineindex = 0;
  			scan->rs_inited = true;
--- 681,691 ----
  				tuple->t_data = NULL;
  				return;
  			}
! 			/*
! 			 * start the scan at the location that we chose
! 			 * in ss_init()
! 			 */
! 			page = scan->rs_start_page;
  			heapgetpage(scan, page);
  			lineindex = 0;
  			scan->rs_inited = true;
***************
*** 585,598 ****
  		}
  
  		/*
! 		 * if we get here, it means we've exhausted the items on this page and
  		 * it's time to move to the next.
  		 */
  
  		/*
! 		 * return NULL if we've exhausted all the pages
  		 */
! 		if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks))
  		{
  			if (BufferIsValid(scan->rs_cbuf))
  				ReleaseBuffer(scan->rs_cbuf);
--- 820,847 ----
  		}
  
  		/*
! 		 * If we get here, it means we've exhausted the items on this page and
  		 * it's time to move to the next.
+ 		 *
+ 		 * For the forward scan, we need to wrap around to the beginning
+ 		 * of the relation file if we reach the end.
  		 */
+ 		if(backward)
+ 			page--;
+ 		else
+ 			page = (page + 1) % (scan->rs_nblocks);
+ 
+ 		if(! (page % SYNC_SCAN_REPORT_INTERVAL) )
+ 			ss_store_hint(scan,page);
  
  		/*
! 		 * Return NULL if we've exhausted all the pages.
! 		 * For reverse scans, that means we've reached 0. For 
! 		 * forward scans, that means we've reached the page on
! 		 * which we started.
  		 */
! 		if ((backward && (page == 0)) || 
! 			((page%(scan->rs_nblocks)) == scan->rs_start_page))
  		{
  			if (BufferIsValid(scan->rs_cbuf))
  				ReleaseBuffer(scan->rs_cbuf);
***************
*** 603,609 ****
  			return;
  		}
  
- 		page = backward ? (page - 1) : (page + 1);
  		heapgetpage(scan, page);
  
  		dp = (Page) BufferGetPage(scan->rs_cbuf);
--- 852,857 ----
***************
*** 616,621 ****
--- 864,880 ----
  	}
  }
  
+ /*
+  * SyncScanShmemSize:
+  * Bytes of shared memory needed for the Sync Scan Hint Table;
+  * CreateSharedMemoryAndSemaphores() adds this to its request.
+  */
+ Size SyncScanShmemSize(void)
+ {
+ 	Size		table_bytes = sizeof(ss_hint_t) * SYNC_SCAN_TABLE_SIZE;
+ 
+ 	return table_bytes;
+ }
  
  #if defined(DISABLE_COMPLEX_MACRO)
  /*
diff -cr postgresql-8.2.3/src/backend/storage/ipc/ipci.c postgresql-8.2.3-syncscan/src/backend/storage/ipc/ipci.c
*** postgresql-8.2.3/src/backend/storage/ipc/ipci.c	2006-10-15 15:04:07.000000000 -0700
--- postgresql-8.2.3-syncscan/src/backend/storage/ipc/ipci.c	2007-03-13 21:58:56.000000000 -0700
***************
*** 19,24 ****
--- 19,25 ----
  #include "access/nbtree.h"
  #include "access/subtrans.h"
  #include "access/twophase.h"
+ #include "access/heapam.h"
  #include "miscadmin.h"
  #include "pgstat.h"
  #include "postmaster/bgwriter.h"
***************
*** 110,115 ****
--- 111,117 ----
  		size = add_size(size, FreeSpaceShmemSize());
  		size = add_size(size, BgWriterShmemSize());
  		size = add_size(size, BTreeShmemSize());
+ 		size = add_size(size, SyncScanShmemSize());
  #ifdef EXEC_BACKEND
  		size = add_size(size, ShmemBackendArraySize());
  #endif
diff -cr postgresql-8.2.3/src/backend/utils/misc/guc.c postgresql-8.2.3-syncscan/src/backend/utils/misc/guc.c
*** postgresql-8.2.3/src/backend/utils/misc/guc.c	2006-11-29 06:50:07.000000000 -0800
--- postgresql-8.2.3-syncscan/src/backend/utils/misc/guc.c	2007-03-13 23:23:31.000000000 -0700
***************
*** 25,31 ****
  #include <syslog.h>
  #endif
  
! 
  #include "access/gin.h"
  #include "access/twophase.h"
  #include "access/xact.h"
--- 25,31 ----
  #include <syslog.h>
  #endif
  
! #include "access/heapam.h"
  #include "access/gin.h"
  #include "access/twophase.h"
  #include "access/xact.h"
***************
*** 758,763 ****
--- 758,773 ----
  		false, NULL, NULL
  	},
  
+ 	{
+ 		{"trace_sync_seqscan", PGC_USERSET, DEVELOPER_OPTIONS,
+ 			gettext_noop("Generates debugging output for Synchronized Scans."),
+ 			NULL,
+ 			GUC_NOT_IN_SAMPLE
+ 		},
+ 		&Trace_sync_seqscan,
+ 		false, NULL, NULL
+ 	},
+ 
  #ifdef LOCK_DEBUG
  	{
  		{"trace_locks", PGC_SUSET, DEVELOPER_OPTIONS,
***************
*** 1723,1728 ****
--- 1733,1754 ----
  		DEFAULT_GEQO_SELECTION_BIAS, MIN_GEQO_SELECTION_BIAS,
  		MAX_GEQO_SELECTION_BIAS, NULL, NULL
  	},
+ 	{
+ 		{"sync_seqscan_threshold", PGC_USERSET, QUERY_TUNING_SYNC_SEQSCAN,
+ 			gettext_noop("Minimum size of table before synchronized scanning takes effect, as a fraction of shared_buffers."),
+ 			NULL
+ 		},
+ 		&sync_seqscan_threshold,
+ 		DEFAULT_SYNC_SCAN_THRESHOLD, 0.0, 100.0, NULL, NULL
+ 	},
+ 	{
+ 		{"sync_seqscan_offset", PGC_USERSET, QUERY_TUNING_SYNC_SEQSCAN,
+ 			gettext_noop("Start synchronized scans at this offset (as a fraction of shared_buffers) before other scans."),
+ 			NULL
+ 		},
+ 		&sync_seqscan_offset,
+ 		DEFAULT_SYNC_SCAN_OFFSET, 0.0, 100.0, NULL, NULL
+ 	},
  
  	{
  		{"bgwriter_lru_percent", PGC_SIGHUP, RESOURCES,
diff -cr postgresql-8.2.3/src/include/access/heapam.h postgresql-8.2.3-syncscan/src/include/access/heapam.h
*** postgresql-8.2.3/src/include/access/heapam.h	2006-11-05 14:42:10.000000000 -0800
--- postgresql-8.2.3-syncscan/src/include/access/heapam.h	2007-03-13 23:22:11.000000000 -0700
***************
*** 25,30 ****
--- 25,49 ----
  #include "utils/rel.h"
  #include "utils/tqual.h"
  
+ /*
+  * Number of ss_hint_t entries in the Sync Scan Hint Table.
+  */
+ #define SYNC_SCAN_TABLE_SIZE   1000
+ 
+ /*
+  * Interval, in pages, between reports of the location
+  * of the current scan to the hint table.
+  */
+ #define SYNC_SCAN_REPORT_INTERVAL 100
+ 
+ #define DEFAULT_SYNC_SCAN_THRESHOLD 1.0	/* fraction of NBuffers */
+ #define DEFAULT_SYNC_SCAN_OFFSET 0.0	/* fraction of NBuffers */
+ 
+ extern DLLIMPORT bool Trace_sync_seqscan;	/* GUC trace_sync_seqscan */
+ extern DLLIMPORT double sync_seqscan_threshold;	/* GUC sync_seqscan_threshold */
+ extern DLLIMPORT double sync_seqscan_offset;	/* GUC sync_seqscan_offset */
+ extern Size SyncScanShmemSize(void);	/* shared memory needed by the hint table */
+ 
  /* ----------------
   *		fastgetattr
   *
diff -cr postgresql-8.2.3/src/include/access/relscan.h postgresql-8.2.3-syncscan/src/include/access/relscan.h
*** postgresql-8.2.3/src/include/access/relscan.h	2006-10-03 17:30:07.000000000 -0700
--- postgresql-8.2.3-syncscan/src/include/access/relscan.h	2007-03-13 21:58:56.000000000 -0700
***************
*** 19,24 ****
--- 19,33 ----
  #include "utils/tqual.h"
  
  
+ /* 
+  * One entry of the Sync Scan Hint Table: the most recently stored
+  * scan position for a relation. relid == 0 marks an unused entry.
+  */
+ typedef struct {
+ 	Oid         relid;    /* Oid of the relation this entry describes */
+ 	BlockNumber location; /* Block number last stored by a scan of it */
+ } ss_hint_t;
+ 
  typedef struct HeapScanDescData
  {
  	/* scan parameters */
***************
*** 33,38 ****
--- 42,49 ----
  	bool		rs_inited;		/* false = scan not init'd yet */
  	HeapTupleData rs_ctup;		/* current tuple in scan, if any */
  	BlockNumber rs_cblock;		/* current block # in scan, if any */
+ 	BlockNumber rs_start_page;  /* page where this scan began */
+ 	ss_hint_t	*rs_hint;		/* pointer to scan hint */
  	Buffer		rs_cbuf;		/* current buffer in scan, if any */
  	/* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
  	ItemPointerData rs_mctid;	/* marked scan position, if any */
diff -cr postgresql-8.2.3/src/include/utils/guc_tables.h postgresql-8.2.3-syncscan/src/include/utils/guc_tables.h
*** postgresql-8.2.3/src/include/utils/guc_tables.h	2006-10-03 14:11:55.000000000 -0700
--- postgresql-8.2.3-syncscan/src/include/utils/guc_tables.h	2007-03-13 22:41:15.000000000 -0700
***************
*** 56,61 ****
--- 56,62 ----
  	QUERY_TUNING_METHOD,
  	QUERY_TUNING_COST,
  	QUERY_TUNING_GEQO,
+ 	QUERY_TUNING_SYNC_SEQSCAN,
  	QUERY_TUNING_OTHER,
  	LOGGING,
  	LOGGING_WHERE,
---------------------------(end of broadcast)---------------------------
TIP 1: if posting/reading through Usenet, please send an appropriate
       subscribe-nomail command to [EMAIL PROTECTED] so that your
       message can get through to the mailing list cleanly

Reply via email to