This is my latest revision of the Sync Scan patch, and it implements the observability as discussed with Simon.
Changes:
* ss_report_loc() (named ss_store_hint() in the attached patch) is called once per hundred pages rather than once per page.
* DEBUG messages are a little cleaner and easier to parse, for the sake of analysis after the fact.
* DEBUG2 reports a sync scan starting, the relation size in pages, and the location at which the scan starts.
* DEBUG2 reports the location of a scan every 50k pages, DEBUG3 every 5k pages (before it was 100k/10k at DEBUG3/DEBUG4, respectively). Numbers are aligned along 5k boundaries to make analysis easier.
* GUCs:
  * sync_seqscan_threshold: fraction of NBuffers for the threshold
  * sync_seqscan_offset: fraction of NBuffers for the offset
  * trace_sync_seqscan: will be used in the final version of the patch to control DEBUG output

sync_seqscan_offset may be eliminated completely if it's not shown to be useful enough in conjunction with Simon's patch. Sync Scans are still a big win without sync_seqscan_offset.

sync_seqscan_threshold=<real> may be turned into sync_seqscan=<boolean> with a fixed activation threshold (NBuffers/2 per Simon's suggestion). The reason is that synchronized scans should activate at the same threshold as Simon's scan_recycle_buffers feature. Should we make a "#define BIG_SCAN_THRESHOLD NBuffers/2" to use for both sync_seqscan and for scan_recycle_buffers?

Regards, Jeff Davis
diff -cr postgresql-8.2.3/src/backend/access/heap/heapam.c postgresql-8.2.3-syncscan/src/backend/access/heap/heapam.c *** postgresql-8.2.3/src/backend/access/heap/heapam.c 2007-02-04 12:00:49.000000000 -0800 --- postgresql-8.2.3-syncscan/src/backend/access/heap/heapam.c 2007-03-13 23:21:27.000000000 -0700 *************** *** 65,70 **** --- 65,279 ---- * ---------------------------------------------------------------- */ + static BlockNumber ss_init(HeapScanDesc); + static int ss_store_hint(HeapScanDesc,BlockNumber); + static int ss_hash(HeapScanDesc); + bool Trace_sync_seqscan = false; + double sync_seqscan_threshold = DEFAULT_SYNC_SCAN_THRESHOLD; + double sync_seqscan_offset = DEFAULT_SYNC_SCAN_OFFSET; + + /* + * ss_init: + * + * This function reads the Sync Scan Hint Table + * (creating it if it doesn't already exist) to + * find a possible location for an already running + * sequential scan on this relation. + * + * By starting a sequential scan near the location + * of an already running scan, we improve the chance + * of finding pages in cache. + * + * Also, depending on sync_seqscan_offset, this + * function will subtract from the hint before + * starting the scan, in order to pick up pages that + * are likely to already be in cache. + * + * This function assumes that scan->rs_nblocks is + * already properly set, and sets scan->rs_start_page + * to a value based on the hint found. Unless the relation is below the size threshold, it also sets + * scan->rs_hint to point to the location of the hint + * in the hint table. Every return path returns 0. + */ + static BlockNumber ss_init(HeapScanDesc scan) + { + ss_hint_t *hint_table; + int table_offset; + bool found; + int threshold = sync_seqscan_threshold * NBuffers; + int offset = sync_seqscan_offset * NBuffers; + + /* + * If the table is smaller than sync_seqscan_threshold times NBuffers pages, + * don't Sync Scan. 
+ */ + if(scan->rs_nblocks < threshold) + { + elog(DEBUG2,"SYNC_SCAN: Table too small to sync scan"); + scan->rs_start_page = 0; + return 0; + } + + table_offset = ss_hash(scan); + hint_table = (ss_hint_t*)ShmemInitStruct("Sync Scan Hint Table", + SYNC_SCAN_TABLE_SIZE*sizeof(ss_hint_t),&found); + + scan->rs_hint = &hint_table[table_offset]; + + /* + * If we just created the hint table for the first time, + * initialize the table to zero and start the scan at page 0. + */ + if(!found) { + elog(DEBUG2,"SYNC_SCAN: Created Hint Table"); + memset(hint_table,0,sizeof(ss_hint_t)*SYNC_SCAN_TABLE_SIZE); + scan->rs_start_page = 0; + return 0; + } + + /* + * If the hint's relid is 0, that means + * we have not previously created a hint + * at this location in the table. + */ + if(scan->rs_hint->relid == 0) { + elog(DEBUG2, "SYNC_SCAN: Hint empty"); + scan->rs_start_page = 0; + return 0; + } + + /* + * If the relid doesn't match the one in the hint, + * we have a hash collision. + */ + if(RelationGetRelid(scan->rs_rd) != scan->rs_hint->relid) + { + elog(DEBUG1,"SYNC_SCAN: Hash collision"); + scan->rs_start_page = 0; + return 0; + } + + /* + * If the hint is not a valid block number + * for this relation, start at 0. + * + * This can happen if, for instance, someone + * TRUNCATEd the table between when the hint + * was set and now. NOTE(review): the less-than-zero test below is dead code if BlockNumber is an unsigned type -- confirm. + */ + if(scan->rs_hint->location < 0 || + scan->rs_hint->location >= scan->rs_nblocks) + { + elog(DEBUG2,"SYNC_SCAN: Hint %d out of range." \ + " Relation has %d pages.", + scan->rs_hint->location,scan->rs_nblocks); + scan->rs_start_page = 0; + return 0; + } + + scan->rs_start_page = scan->rs_hint->location; + + /* + * By starting at offset earlier than the hint, + * it's likely that all of the blocks will already be + * cached, and the scan will quickly catch up to the head. + * + * offset is a non-negative value that will be + * subtracted from the hint. 
+ */ + if(offset > scan->rs_nblocks) + { + elog(DEBUG2,"SYNC_SCAN: Relation smaller than start offset: %d", + offset); + return 0; + } + + /* + * If subtracting the offset would bring the value + * to less than 0, we circle backwards to the end of the + * file. + */ + if(offset > scan->rs_start_page) + scan->rs_start_page += scan->rs_nblocks; + + scan->rs_start_page -= offset; + + elog(DEBUG2,"SYNC_SCAN: START: OID = %d; Location = %d; Size: %d", + RelationGetRelid(scan->rs_rd), + scan->rs_start_page,scan->rs_nblocks); + + return 0; + } + + /* + * ss_store_hint: + * + * Writes an entry in the Sync Scan Hint Table + * of the form (relid,blocknumber). This will + * overwrite any existing entry that may collide + * with this entry in the table. + * + * No locking is performed here. When this data is + * later read by ss_init(), sanity checking is + * performed to ensure we don't use an invalid + * relation block number. Always returns 0. + */ + static int ss_store_hint(HeapScanDesc scan, BlockNumber location) + { + ss_hint_t hint; + int threshold = sync_seqscan_threshold * NBuffers; + int offset = sync_seqscan_offset * NBuffers; + + /* + * Print every 50k pages to DEBUG2 + * and every 5k pages to DEBUG3. + */ + if (!(location%50000)) + elog(DEBUG2,"page: %d",location); + else if (!(location%5000)) + elog(DEBUG3,"page: %d",location); + + /* + * If the table is too small, don't bother + * with Sync Scan. + */ + if(scan->rs_nblocks < threshold) + return 0; + + /* + * If this scan has been progressing for less + * than offset pages, don't store the hint. 
+ */ + if(location >= scan->rs_start_page) + { + if((location - scan->rs_start_page) < offset) + return 0; + } + else + { + if((location + scan->rs_nblocks - scan->rs_start_page) + < offset) + return 0; + } + + hint.relid = RelationGetRelid(scan->rs_rd); + hint.location = location; + + *scan->rs_hint = hint; + + return 0; + } + + /* + * This is a simplistic function to hash + * the Oid of the relation for placement in + * the Sync Scan Hint Table (relid modulo SYNC_SCAN_TABLE_SIZE). + */ + static int ss_hash(HeapScanDesc scan) + { + return RelationGetRelid(scan->rs_rd) % SYNC_SCAN_TABLE_SIZE; + } + /* ---------------- * initscan - scan code common to heap_beginscan and heap_rescan * ---------------- *************** *** 81,86 **** --- 290,300 ---- */ scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd); + /* + * Choose a good place to start the relation scan. + */ + ss_init(scan); + scan->rs_inited = false; scan->rs_ctup.t_data = NULL; ItemPointerSetInvalid(&scan->rs_ctup.t_self); *************** *** 223,229 **** tuple->t_data = NULL; return; } ! page = 0; /* first page */ heapgetpage(scan, page); lineoff = FirstOffsetNumber; /* first offnum */ scan->rs_inited = true; --- 437,447 ---- tuple->t_data = NULL; return; } ! /* ! * start the scan at the location that we chose ! * in ss_init() ! */ ! page = scan->rs_start_page; heapgetpage(scan, page); lineoff = FirstOffsetNumber; /* first offnum */ scan->rs_inited = true; *************** *** 364,378 **** } /* ! * if we get here, it means we've exhausted the items on this page and * it's time to move to the next. */ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); /* ! * return NULL if we've exhausted all the pages */ ! if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); --- 582,611 ---- } /* ! * If we get here, it means we've exhausted the items on this page and * it's time to move to the next. 
+ * + * For the forward scan, we need to wrap around to the beginning + * of the relation file if we reach the end. */ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + if(backward) + page--; + else + page = (page + 1) % (scan->rs_nblocks); + + if(! (page % SYNC_SCAN_REPORT_INTERVAL) ) + ss_store_hint(scan,page); + /* ! * Return NULL if we've exhausted all the pages. ! * For reverse scans, that means we've reached 0. For ! * forward scans, that means we've reached the page on ! * which we started. NOTE(review): page is decremented before this test, so a reverse scan appears to stop at page 0 without fetching it -- confirm. */ ! if ((backward && (page == 0)) || ! ((page%(scan->rs_nblocks)) == scan->rs_start_page)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); *************** *** 383,390 **** return; } - page = backward ? (page - 1) : (page + 1); - heapgetpage(scan, page); LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); --- 616,621 ---- *************** *** 450,456 **** tuple->t_data = NULL; return; } ! page = 0; /* first page */ heapgetpage(scan, page); lineindex = 0; scan->rs_inited = true; --- 681,691 ---- tuple->t_data = NULL; return; } ! /* ! * start the scan at the location that we chose ! * in ss_init() ! */ ! page = scan->rs_start_page; heapgetpage(scan, page); lineindex = 0; scan->rs_inited = true; *************** *** 585,598 **** } /* ! * if we get here, it means we've exhausted the items on this page and * it's time to move to the next. */ /* ! * return NULL if we've exhausted all the pages */ ! if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); --- 820,847 ---- } /* ! * If we get here, it means we've exhausted the items on this page and * it's time to move to the next. + * + * For the forward scan, we need to wrap around to the beginning + * of the relation file if we reach the end. */ + if(backward) + page--; + else + page = (page + 1) % (scan->rs_nblocks); + + if(! 
(page % SYNC_SCAN_REPORT_INTERVAL) ) + ss_store_hint(scan,page); /* ! * Return NULL if we've exhausted all the pages. ! 
* For reverse scans, that means we've reached 0. For ! * forward scans, that means we've reached the page on ! * which we started. NOTE(review): page is decremented before this test, so a reverse scan appears to stop at page 0 without fetching it -- confirm. */ ! if ((backward && (page == 0)) || ! ((page%(scan->rs_nblocks)) == scan->rs_start_page)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); *************** *** 603,609 **** return; } - page = backward ? (page - 1) : (page + 1); heapgetpage(scan, page); dp = (Page) BufferGetPage(scan->rs_cbuf); --- 852,857 ---- *************** *** 616,621 **** --- 864,880 ---- } } + /* + * SyncScanShmemSize: + * + * Called by CreateSharedMemoryAndSemaphores() + * to find out how much room the Sync Scan Hint + * Table will need to occupy. + */ + Size SyncScanShmemSize(void) + { + return SYNC_SCAN_TABLE_SIZE*sizeof(ss_hint_t); + } #if defined(DISABLE_COMPLEX_MACRO) /* diff -cr postgresql-8.2.3/src/backend/storage/ipc/ipci.c postgresql-8.2.3-syncscan/src/backend/storage/ipc/ipci.c *** postgresql-8.2.3/src/backend/storage/ipc/ipci.c 2006-10-15 15:04:07.000000000 -0700 --- postgresql-8.2.3-syncscan/src/backend/storage/ipc/ipci.c 2007-03-13 21:58:56.000000000 -0700 *************** *** 19,24 **** --- 19,25 ---- #include "access/nbtree.h" #include "access/subtrans.h" #include "access/twophase.h" + #include "access/heapam.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/bgwriter.h" *************** *** 110,115 **** --- 111,117 ---- size = add_size(size, FreeSpaceShmemSize()); size = add_size(size, BgWriterShmemSize()); size = add_size(size, BTreeShmemSize()); + size = add_size(size, SyncScanShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif diff -cr postgresql-8.2.3/src/backend/utils/misc/guc.c postgresql-8.2.3-syncscan/src/backend/utils/misc/guc.c *** postgresql-8.2.3/src/backend/utils/misc/guc.c 2006-11-29 06:50:07.000000000 -0800 --- postgresql-8.2.3-syncscan/src/backend/utils/misc/guc.c 2007-03-13 23:23:31.000000000 -0700 *************** *** 25,31 **** #include <syslog.h> #endif ! 
#include "access/gin.h" #include "access/twophase.h" #include "access/xact.h" --- 25,31 ---- #include <syslog.h> #endif ! #include "access/heapam.h" #include "access/gin.h" #include "access/twophase.h" #include "access/xact.h" *************** *** 758,763 **** --- 758,773 ---- false, NULL, NULL }, + { + {"trace_sync_seqscan", PGC_USERSET, DEVELOPER_OPTIONS, + gettext_noop("Generates debugging output for Synchronized Scans."), + NULL, + GUC_NOT_IN_SAMPLE + }, + &Trace_sync_seqscan, + false, NULL, NULL + }, + #ifdef LOCK_DEBUG { {"trace_locks", PGC_SUSET, DEVELOPER_OPTIONS, *************** *** 1723,1728 **** --- 1733,1754 ---- DEFAULT_GEQO_SELECTION_BIAS, MIN_GEQO_SELECTION_BIAS, MAX_GEQO_SELECTION_BIAS, NULL, NULL }, + { + {"sync_seqscan_threshold", PGC_USERSET, QUERY_TUNING_SYNC_SEQSCAN, + gettext_noop("Minimum size of table before synchronized scanning takes effect, as a fraction of shared_buffers."), + NULL + }, + &sync_seqscan_threshold, + DEFAULT_SYNC_SCAN_THRESHOLD, 0.0, 100.0, NULL, NULL + }, + { + {"sync_seqscan_offset", PGC_USERSET, QUERY_TUNING_SYNC_SEQSCAN, + gettext_noop("Start synchronized scans at this offset (as a fraction of shared_buffers) before other scans."), + NULL + }, + &sync_seqscan_offset, + DEFAULT_SYNC_SCAN_OFFSET, 0.0, 100.0, NULL, NULL + }, { {"bgwriter_lru_percent", PGC_SIGHUP, RESOURCES, diff -cr postgresql-8.2.3/src/include/access/heapam.h postgresql-8.2.3-syncscan/src/include/access/heapam.h *** postgresql-8.2.3/src/include/access/heapam.h 2006-11-05 14:42:10.000000000 -0800 --- postgresql-8.2.3-syncscan/src/include/access/heapam.h 2007-03-13 23:22:11.000000000 -0700 *************** *** 25,30 **** --- 25,49 ---- #include "utils/rel.h" #include "utils/tqual.h" + /* + * Size of the Sync Scan Hint Table. + */ + #define SYNC_SCAN_TABLE_SIZE 1000 + + /* + * Interval between reports of the location + * of the current scan, in pages. 
+ */ + #define SYNC_SCAN_REPORT_INTERVAL 100 + + #define DEFAULT_SYNC_SCAN_THRESHOLD 1.0 + #define DEFAULT_SYNC_SCAN_OFFSET 0.0 + + extern DLLIMPORT bool Trace_sync_seqscan; + extern DLLIMPORT double sync_seqscan_threshold; + extern DLLIMPORT double sync_seqscan_offset; + extern Size SyncScanShmemSize(void); + /* ---------------- * fastgetattr * diff -cr postgresql-8.2.3/src/include/access/relscan.h postgresql-8.2.3-syncscan/src/include/access/relscan.h *** postgresql-8.2.3/src/include/access/relscan.h 2006-10-03 17:30:07.000000000 -0700 --- postgresql-8.2.3-syncscan/src/include/access/relscan.h 2007-03-13 21:58:56.000000000 -0700 *************** *** 19,24 **** --- 19,33 ---- #include "utils/tqual.h" + /* + * Structure of an entry in the + * Sync Scan Hint Table. + */ + typedef struct { + Oid relid; /* The relid that tags this hint entry */ + BlockNumber location; /* The block number most recently stored for relid by ss_store_hint() */ + } ss_hint_t; + typedef struct HeapScanDescData { /* scan parameters */ *************** *** 33,38 **** --- 42,49 ---- bool rs_inited; /* false = scan not init'd yet */ HeapTupleData rs_ctup; /* current tuple in scan, if any */ BlockNumber rs_cblock; /* current block # in scan, if any */ + BlockNumber rs_start_page; /* page where this scan began */ + ss_hint_t *rs_hint; /* pointer to scan hint */ Buffer rs_cbuf; /* current buffer in scan, if any */ /* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ ItemPointerData rs_mctid; /* marked scan position, if any */ diff -cr postgresql-8.2.3/src/include/utils/guc_tables.h postgresql-8.2.3-syncscan/src/include/utils/guc_tables.h *** postgresql-8.2.3/src/include/utils/guc_tables.h 2006-10-03 14:11:55.000000000 -0700 --- postgresql-8.2.3-syncscan/src/include/utils/guc_tables.h 2007-03-13 22:41:15.000000000 -0700 *************** *** 56,61 **** --- 56,62 ---- QUERY_TUNING_METHOD, QUERY_TUNING_COST, QUERY_TUNING_GEQO, + QUERY_TUNING_SYNC_SEQSCAN, QUERY_TUNING_OTHER, LOGGING, LOGGING_WHERE,
---------------------------(end of broadcast)--------------------------- TIP 1: if posting/reading through Usenet, please send an appropriate subscribe-nomail command to [EMAIL PROTECTED] so that your message can get through to the mailing list cleanly