On Thu, 2007-03-22 at 16:43 -0400, Bruce Momjian wrote:
> Will use '16' rather than '100'.
> 
> Your patch has been added to the PostgreSQL unapplied patches list at:
> 
>       http://momjian.postgresql.org/cgi-bin/pgpatches
> 
> It will be applied as soon as one of the PostgreSQL committers reviews
> and approves it.
> 

Here is the latest version, which includes the change to report every 16
pages. 

This patch has the following improvements:

 * changes the reporting interval to 16 pages
 * rearranges the scan location tracing output to work regardless of the
reporting interval. Previously it did not trace the output correctly if
the logging interval was not an even multiple of the reporting interval
 * GUC trace_sync_seqscan=<bool> now controls whether the DEBUG output
is generated or not. If this is true, a lot of logging output will be
generated at DEBUG3. 
 * You can set sync_seqscan_threshold=<-1.0 ... 100.0>. Non-negative
values are treated as a fraction of NBuffers (the minimum table size for
sync scans). Negative values disable sync scans.
 
Still TODO:

 * Publish my test results (I've collected much of the raw data already
on this version of the patch)
 * SGML documentation (after we stabilize the GUC names and meanings)
 * Possibly remove sync_seqscan_threshold=<real> and instead use a
simple enable/disable boolean that sets the threshold at a constant
fraction of NBuffers (most likely the same fraction as Simon's recycle
buffers patch)

Regards,
        Jeff Davis
diff -cr postgresql-8.2.3/src/backend/access/heap/heapam.c postgresql-8.2.3-ss/src/backend/access/heap/heapam.c
*** postgresql-8.2.3/src/backend/access/heap/heapam.c	Sun Feb  4 12:00:49 2007
--- postgresql-8.2.3-ss/src/backend/access/heap/heapam.c	Tue Mar 20 16:12:12 2007
***************
*** 65,70 ****
--- 65,275 ----
   * ----------------------------------------------------------------
   */
  
+ static BlockNumber ss_init(HeapScanDesc);
+ static int         ss_store_hint(HeapScanDesc,BlockNumber);
+ static int         ss_hash(HeapScanDesc);
+ bool Trace_sync_seqscan = false;
+ double sync_seqscan_threshold = DEFAULT_SYNC_SCAN_THRESHOLD;
+ double sync_seqscan_offset = DEFAULT_SYNC_SCAN_OFFSET;
+ 
+ /*
+  * ss_init:
+  *
+  * Pick the starting page for a sequential scan. Looks up this
+  * relation's slot in the shared Sync Scan Hint Table (creating the
+  * table on first use) and, if a usable hint from another scan is
+  * found, starts near it to improve the chance of cache hits.
+  *
+  * Tables smaller than sync_seqscan_threshold * NBuffers pages, or any
+  * table when that setting is negative, always start at page 0.
+  *
+  * sync_seqscan_offset * NBuffers pages are subtracted from the hint
+  * (wrapping around the end of the relation) so that the scan begins
+  * on pages that are likely to already be in cache.
+  *
+  * Assumes scan->rs_nblocks is already set. Sets scan->rs_start_page
+  * and, unless Sync Scan is disabled for this scan, scan->rs_hint.
+  * Always returns 0; callers should read scan->rs_start_page.
+  *
+  * The hint table is read without locking, so the entry is copied once
+  * and only that copy is validated and used; re-reading shared memory
+  * after validation could pick up a value stored by another backend.
+  */
+ static BlockNumber ss_init(HeapScanDesc scan)
+ {
+ 	ss_hint_t *hint_table;
+ 	ss_hint_t hint;
+ 	bool found;
+ 	int threshold = sync_seqscan_threshold * NBuffers;
+ 	int offset = sync_seqscan_offset * NBuffers;
+ 
+ 	/*
+ 	 * If the table is not large enough, or sync_seqscan_threshold
+ 	 * is disabled (negative), don't Sync Scan.
+ 	 */
+ 	if(threshold < 0 || scan->rs_nblocks < threshold)
+ 	{
+ 		scan->rs_start_page = 0;
+ 		return 0;
+ 	}
+ 
+ 	/* Find (or create) the hint table and this relation's slot in it. */
+ 	hint_table = (ss_hint_t*)ShmemInitStruct("Sync Scan Hint Table",
+ 		SYNC_SCAN_TABLE_SIZE*sizeof(ss_hint_t),&found);
+ 
+ 	scan->rs_hint = &hint_table[ss_hash(scan)];
+ 
+ 	/*
+ 	 * If we just created the hint table for the first time,
+ 	 * initialize the table to zero and start the scan at page 0.
+ 	 */
+ 	if(!found) {
+ 		if(Trace_sync_seqscan)
+ 			elog(DEBUG2,"SYNC_SCAN: Created Hint Table");
+ 		memset(hint_table,0,sizeof(ss_hint_t)*SYNC_SCAN_TABLE_SIZE);
+ 		scan->rs_start_page = 0;
+ 		return 0;
+ 	}
+ 
+ 	/*
+ 	 * Take one snapshot of the entry; other backends may overwrite it
+ 	 * at any time. A relid of 0 means the slot has never been used.
+ 	 */
+ 	hint = *(volatile ss_hint_t *) scan->rs_hint;
+ 	if(hint.relid == 0) {
+ 		if(Trace_sync_seqscan)
+ 			elog(DEBUG2, "SYNC_SCAN: Hint empty");
+ 		scan->rs_start_page = 0;
+ 		return 0;
+ 	}
+ 
+ 	/*
+ 	 * If the relid doesn't match the one in the hint,
+ 	 * we have a hash collision.
+ 	 */
+ 	if(RelationGetRelid(scan->rs_rd) != hint.relid)
+ 	{
+ 		if(Trace_sync_seqscan)
+ 			elog(DEBUG1,"SYNC_SCAN: Hash collision");
+ 		scan->rs_start_page = 0;
+ 		return 0;
+ 	}
+ 
+ 	/*
+ 	 * If the hint is not a valid block number
+ 	 * for this relation, start at 0.
+ 	 *
+ 	 * This can happen if, for instance, someone
+ 	 * TRUNCATEd the table between when the hint
+ 	 * was set and now. BlockNumber is unsigned, so
+ 	 * only the upper bound needs to be checked.
+ 	 */
+ 	if(hint.location >= scan->rs_nblocks)
+ 	{
+ 		if(Trace_sync_seqscan)
+ 			elog(DEBUG2,"SYNC_SCAN: Hint %u out of range."
+ 				" Relation has %u pages.",
+ 				hint.location,scan->rs_nblocks);
+ 		scan->rs_start_page = 0;
+ 		return 0;
+ 	}
+ 
+ 	scan->rs_start_page = hint.location;
+ 
+ 	/*
+ 	 * By starting at offset earlier than the hint,
+ 	 * it's likely that all of the blocks will already be
+ 	 * cached, and the scan will quickly catch up to the head.
+ 	 *
+ 	 * offset is subtracted from the hint; if it exceeds the
+ 	 * relation size, the scan starts at the hint itself.
+ 	 */
+ 	if(offset > scan->rs_nblocks)
+ 	{
+ 		if(Trace_sync_seqscan)
+ 			elog(DEBUG2,"SYNC_SCAN: Relation smaller than start offset: %d",
+ 				offset);
+ 		return 0;
+ 	}
+ 
+ 	/*
+ 	 * If subtracting the offset would bring the value
+ 	 * to less than 0, we circle backwards to the end of the
+ 	 * file.
+ 	 */
+ 	if(offset > scan->rs_start_page)
+ 		scan->rs_start_page += scan->rs_nblocks;
+ 
+ 	scan->rs_start_page -= offset;
+ 
+ 	if(Trace_sync_seqscan)
+ 		elog(DEBUG2,"SYNC_SCAN: START: OID = %u; Location = %u; Size: %u",
+ 			RelationGetRelid(scan->rs_rd),
+ 			scan->rs_start_page,scan->rs_nblocks);
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * ss_store_hint:
+  *
+  * Record (relid, location) in this scan's slot of the Sync Scan Hint
+  * Table, overwriting any entry (even a colliding one) already there.
+  *
+  * Nothing is stored when Sync Scan does not apply to this relation,
+  * or while the scan is still within sync_seqscan_offset * NBuffers
+  * pages of where it started.
+  *
+  * The store is done without locking; ss_init() sanity-checks
+  * whatever it later reads back. Always returns 0.
+  */
+ static int ss_store_hint(HeapScanDesc scan, BlockNumber location)
+ {
+ 	int min_pages = sync_seqscan_threshold * NBuffers;
+ 	int lead = sync_seqscan_offset * NBuffers;
+ 	BlockNumber travelled;
+ 	ss_hint_t entry;
+ 
+ 	/*
+ 	 * Sync Scan is off when the threshold is negative, and is not
+ 	 * used for relations smaller than the threshold.
+ 	 */
+ 	if(min_pages < 0)
+ 		return 0;
+ 	if(scan->rs_nblocks < min_pages)
+ 		return 0;
+ 
+ 	/*
+ 	 * Work out how many pages lie between the scan's starting page
+ 	 * and the reported location, allowing for the scan having wrapped
+ 	 * around the end of the relation.
+ 	 */
+ 	if(location < scan->rs_start_page)
+ 		travelled = location + scan->rs_nblocks - scan->rs_start_page;
+ 	else
+ 		travelled = location - scan->rs_start_page;
+ 
+ 	/* Too early in the scan to be worth reporting. */
+ 	if(travelled < lead)
+ 		return 0;
+ 
+ 	entry.relid = RelationGetRelid(scan->rs_rd);
+ 	entry.location = location;
+ 	*scan->rs_hint = entry;
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * ss_hash: slot in the Sync Scan Hint Table for the scanned relation.
+  */
+ static int ss_hash(HeapScanDesc scan)
+ {
+ 	Oid relid = RelationGetRelid(scan->rs_rd);
+ 
+ 	return relid % SYNC_SCAN_TABLE_SIZE;	/* simplistic: Oid mod size */
+ }
+ 
  /* ----------------
   *		initscan - scan code common to heap_beginscan and heap_rescan
   * ----------------
***************
*** 81,86 ****
--- 286,296 ----
  	 */
  	scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
  
+ 	/*
+ 	 * Choose a good place to start the relation scan.
+ 	 */
+ 	ss_init(scan);
+ 
  	scan->rs_inited = false;
  	scan->rs_ctup.t_data = NULL;
  	ItemPointerSetInvalid(&scan->rs_ctup.t_self);
***************
*** 223,229 ****
  				tuple->t_data = NULL;
  				return;
  			}
! 			page = 0;			/* first page */
  			heapgetpage(scan, page);
  			lineoff = FirstOffsetNumber;		/* first offnum */
  			scan->rs_inited = true;
--- 433,443 ----
  				tuple->t_data = NULL;
  				return;
  			}
! 			/*
! 			 * start the scan at the location that we chose
! 			 * in ss_init()
! 			 */
! 			page = scan->rs_start_page;
  			heapgetpage(scan, page);
  			lineoff = FirstOffsetNumber;		/* first offnum */
  			scan->rs_inited = true;
***************
*** 364,378 ****
  		}
  
  		/*
! 		 * if we get here, it means we've exhausted the items on this page and
  		 * it's time to move to the next.
  		 */
  		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
  
  		/*
! 		 * return NULL if we've exhausted all the pages
  		 */
! 		if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks))
  		{
  			if (BufferIsValid(scan->rs_cbuf))
  				ReleaseBuffer(scan->rs_cbuf);
--- 578,615 ----
  		}
  
  		/*
! 		 * If we get here, it means we've exhausted the items on this page and
  		 * it's time to move to the next.
+ 		 *
+ 		 * For the forward scan, we need to wrap around to the beginning
+ 		 * of the relation file if we reach the end.
  		 */
  		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
  
+ 		if(backward)
+ 			page--;
+ 		else
+ 			page = (page + 1) % (scan->rs_nblocks);
+ 
+ 		if(Trace_sync_seqscan)
+ 		{
+ 			if (!(page%50000))
+ 				elog(DEBUG2,"page: %d",page);
+ 			else if (!(page%5000))
+ 				elog(DEBUG3,"page: %d",page);
+ 		}
+ 
+ 		if(! (page % SYNC_SCAN_REPORT_INTERVAL) )
+ 			ss_store_hint(scan,page);
+ 
  		/*
! 		 * Return NULL if we've exhausted all the pages.
! 		 * For reverse scans, that means we've reached 0. For 
! 		 * forward scans, that means we've reached the page on
! 		 * which we started.
  		 */
! 		if ((backward && (page == 0)) ||
! 			((page%(scan->rs_nblocks)) == scan->rs_start_page))
  		{
  			if (BufferIsValid(scan->rs_cbuf))
  				ReleaseBuffer(scan->rs_cbuf);
***************
*** 383,390 ****
  			return;
  		}
  
- 		page = backward ? (page - 1) : (page + 1);
- 
  		heapgetpage(scan, page);
  
  		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
--- 620,625 ----
***************
*** 450,456 ****
  				tuple->t_data = NULL;
  				return;
  			}
! 			page = 0;			/* first page */
  			heapgetpage(scan, page);
  			lineindex = 0;
  			scan->rs_inited = true;
--- 685,695 ----
  				tuple->t_data = NULL;
  				return;
  			}
! 			/*
! 			 * start the scan at the location that we chose
! 			 * in ss_init()
! 			 */
! 			page = scan->rs_start_page;
  			heapgetpage(scan, page);
  			lineindex = 0;
  			scan->rs_inited = true;
***************
*** 585,598 ****
  		}
  
  		/*
! 		 * if we get here, it means we've exhausted the items on this page and
  		 * it's time to move to the next.
  		 */
  
  		/*
! 		 * return NULL if we've exhausted all the pages
  		 */
! 		if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks))
  		{
  			if (BufferIsValid(scan->rs_cbuf))
  				ReleaseBuffer(scan->rs_cbuf);
--- 824,859 ----
  		}
  
  		/*
! 		 * If we get here, it means we've exhausted the items on this page and
  		 * it's time to move to the next.
+ 		 *
+ 		 * For the forward scan, we need to wrap around to the beginning
+ 		 * of the relation file if we reach the end.
  		 */
+ 		if(backward)
+ 			page--;
+ 		else
+ 			page = (page + 1) % (scan->rs_nblocks);
+ 
+ 		if(Trace_sync_seqscan)
+ 		{
+ 			if (!(page%50000))
+ 				elog(DEBUG2,"page: %d",page);
+ 			else if (!(page%5000))
+ 				elog(DEBUG3,"page: %d",page);
+ 		}
+ 
+ 		if(! (page % SYNC_SCAN_REPORT_INTERVAL) )
+ 			ss_store_hint(scan,page);
  
  		/*
! 		 * Return NULL if we've exhausted all the pages.
! 		 * For reverse scans, that means we've reached 0. For 
! 		 * forward scans, that means we've reached the page on
! 		 * which we started.
  		 */
! 		if ((backward && (page == 0)) || 
! 			((page%(scan->rs_nblocks)) == scan->rs_start_page))
  		{
  			if (BufferIsValid(scan->rs_cbuf))
  				ReleaseBuffer(scan->rs_cbuf);
***************
*** 603,609 ****
  			return;
  		}
  
- 		page = backward ? (page - 1) : (page + 1);
  		heapgetpage(scan, page);
  
  		dp = (Page) BufferGetPage(scan->rs_cbuf);
--- 864,869 ----
***************
*** 616,621 ****
--- 876,892 ----
  	}
  }
  
+ /*
+  * SyncScanShmemSize:
+  *
+  * Report the amount of shared memory the Sync Scan Hint Table
+  * needs, so that CreateSharedMemoryAndSemaphores() can include
+  * it when sizing the shared memory segment.
+  */
+ Size SyncScanShmemSize(void)
+ {
+ 	return sizeof(ss_hint_t) * SYNC_SCAN_TABLE_SIZE;
+ }
  
  #if defined(DISABLE_COMPLEX_MACRO)
  /*
Only in postgresql-8.2.3-ss/src/backend/access/heap: heapam.c.orig
diff -cr postgresql-8.2.3/src/backend/storage/ipc/ipci.c postgresql-8.2.3-ss/src/backend/storage/ipc/ipci.c
*** postgresql-8.2.3/src/backend/storage/ipc/ipci.c	Sun Oct 15 15:04:07 2006
--- postgresql-8.2.3-ss/src/backend/storage/ipc/ipci.c	Tue Mar 20 16:10:31 2007
***************
*** 19,24 ****
--- 19,25 ----
  #include "access/nbtree.h"
  #include "access/subtrans.h"
  #include "access/twophase.h"
+ #include "access/heapam.h"
  #include "miscadmin.h"
  #include "pgstat.h"
  #include "postmaster/bgwriter.h"
***************
*** 110,115 ****
--- 111,117 ----
  		size = add_size(size, FreeSpaceShmemSize());
  		size = add_size(size, BgWriterShmemSize());
  		size = add_size(size, BTreeShmemSize());
+ 		size = add_size(size, SyncScanShmemSize());
  #ifdef EXEC_BACKEND
  		size = add_size(size, ShmemBackendArraySize());
  #endif
Only in postgresql-8.2.3-ss/src/backend/storage/ipc: ipci.c.orig
diff -cr postgresql-8.2.3/src/backend/utils/misc/guc.c postgresql-8.2.3-ss/src/backend/utils/misc/guc.c
*** postgresql-8.2.3/src/backend/utils/misc/guc.c	Wed Nov 29 06:50:07 2006
--- postgresql-8.2.3-ss/src/backend/utils/misc/guc.c	Tue Mar 20 16:10:31 2007
***************
*** 25,31 ****
  #include <syslog.h>
  #endif
  
! 
  #include "access/gin.h"
  #include "access/twophase.h"
  #include "access/xact.h"
--- 25,31 ----
  #include <syslog.h>
  #endif
  
! #include "access/heapam.h"
  #include "access/gin.h"
  #include "access/twophase.h"
  #include "access/xact.h"
***************
*** 758,763 ****
--- 758,773 ----
  		false, NULL, NULL
  	},
  
+ 	{
+ 		{"trace_sync_seqscan", PGC_USERSET, DEVELOPER_OPTIONS,
+ 			gettext_noop("Generates debugging output for Synchronized Scans."),
+ 			NULL,
+ 			GUC_NOT_IN_SAMPLE
+ 		},
+ 		&Trace_sync_seqscan,
+ 		false, NULL, NULL
+ 	},
+ 
  #ifdef LOCK_DEBUG
  	{
  		{"trace_locks", PGC_SUSET, DEVELOPER_OPTIONS,
***************
*** 1722,1727 ****
--- 1732,1753 ----
  		&Geqo_selection_bias,
  		DEFAULT_GEQO_SELECTION_BIAS, MIN_GEQO_SELECTION_BIAS,
  		MAX_GEQO_SELECTION_BIAS, NULL, NULL
+ 	},
+ 	{
+ 		{"sync_seqscan_threshold", PGC_USERSET, QUERY_TUNING_SYNC_SEQSCAN,
+ 			gettext_noop("Minimum size of table before synchronized scanning takes effect, as a fraction of shared_buffers."),
+ 			NULL
+ 		},
+ 		&sync_seqscan_threshold,
+ 		DEFAULT_SYNC_SCAN_THRESHOLD, -1.0, 100.0, NULL, NULL
+ 	},
+ 	{
+ 		{"sync_seqscan_offset", PGC_USERSET, QUERY_TUNING_SYNC_SEQSCAN,
+ 			gettext_noop("Start synchronized scans at this offset (as a fraction of shared_buffers) before other scans."),
+ 			NULL
+ 		},
+ 		&sync_seqscan_offset,
+ 		DEFAULT_SYNC_SCAN_OFFSET, 0.0, 100.0, NULL, NULL
  	},
  
  	{
Only in postgresql-8.2.3-ss/src/backend/utils/misc: guc.c.orig
diff -cr postgresql-8.2.3/src/include/access/heapam.h postgresql-8.2.3-ss/src/include/access/heapam.h
*** postgresql-8.2.3/src/include/access/heapam.h	Sun Nov  5 14:42:10 2006
--- postgresql-8.2.3-ss/src/include/access/heapam.h	Tue Mar 20 16:10:31 2007
***************
*** 25,30 ****
--- 25,49 ----
  #include "utils/rel.h"
  #include "utils/tqual.h"
  
+ /*
+  * Size of the Sync Scan Hint Table.
+  */
+ #define SYNC_SCAN_TABLE_SIZE   1000
+ 
+ /*
+  * Interval between reports of the location
+  * of the current scan, in pages.
+  */
+ #define SYNC_SCAN_REPORT_INTERVAL 16
+ 
+ #define DEFAULT_SYNC_SCAN_THRESHOLD 1.0
+ #define DEFAULT_SYNC_SCAN_OFFSET 0.0
+ 
+ extern DLLIMPORT bool Trace_sync_seqscan;
+ extern DLLIMPORT double sync_seqscan_threshold;
+ extern DLLIMPORT double sync_seqscan_offset;
+ extern Size SyncScanShmemSize(void);
+ 
  /* ----------------
   *		fastgetattr
   *
Only in postgresql-8.2.3-ss/src/include/access: heapam.h.orig
diff -cr postgresql-8.2.3/src/include/access/relscan.h postgresql-8.2.3-ss/src/include/access/relscan.h
*** postgresql-8.2.3/src/include/access/relscan.h	Tue Oct  3 17:30:07 2006
--- postgresql-8.2.3-ss/src/include/access/relscan.h	Tue Mar 20 16:10:31 2007
***************
*** 19,24 ****
--- 19,33 ----
  #include "utils/tqual.h"
  
  
+ /*
+  * One entry of the shared Sync Scan Hint Table: the most recently
+  * reported scan position for the relation identified by relid.
+  */
+ typedef struct {
+ 	Oid         relid;    /* Relation this hint is for; 0 means the slot is unused */
+ 	BlockNumber location; /* Block number most recently reported by a scan of it */
+ } ss_hint_t;
+ 
  typedef struct HeapScanDescData
  {
  	/* scan parameters */
***************
*** 33,38 ****
--- 42,49 ----
  	bool		rs_inited;		/* false = scan not init'd yet */
  	HeapTupleData rs_ctup;		/* current tuple in scan, if any */
  	BlockNumber rs_cblock;		/* current block # in scan, if any */
+ 	BlockNumber rs_start_page;  /* page where this scan began */
+ 	ss_hint_t	*rs_hint;		/* pointer to scan hint */
  	Buffer		rs_cbuf;		/* current buffer in scan, if any */
  	/* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
  	ItemPointerData rs_mctid;	/* marked scan position, if any */
Only in postgresql-8.2.3-ss/src/include/access: relscan.h.orig
diff -cr postgresql-8.2.3/src/include/utils/guc_tables.h postgresql-8.2.3-ss/src/include/utils/guc_tables.h
*** postgresql-8.2.3/src/include/utils/guc_tables.h	Tue Oct  3 14:11:55 2006
--- postgresql-8.2.3-ss/src/include/utils/guc_tables.h	Tue Mar 20 16:10:31 2007
***************
*** 56,61 ****
--- 56,62 ----
  	QUERY_TUNING_METHOD,
  	QUERY_TUNING_COST,
  	QUERY_TUNING_GEQO,
+ 	QUERY_TUNING_SYNC_SEQSCAN,
  	QUERY_TUNING_OTHER,
  	LOGGING,
  	LOGGING_WHERE,
Only in postgresql-8.2.3-ss/src/include/utils: guc_tables.h.orig
---------------------------(end of broadcast)---------------------------
TIP 4: Have you searched our list archives?

               http://archives.postgresql.org

Reply via email to