Hi,

I revised this patch and re-ran the performance test. It works correctly on Linux and compiles without warnings. In the new version I added a GUC, enable_kernel_readahead. When this GUC is on (the default), reads use POSIX_FADV_NORMAL, which is the general readahead of the OS. When it is off, reads use POSIX_FADV_RANDOM or POSIX_FADV_SEQUENTIAL, chosen according to the buffer hint in Postgres, so the readahead behavior is optimized by Postgres. This parameter can be changed at any time, in any transaction.

* Test server
  Server: HP Proliant DL360 G7
  CPU:    Xeon E5640 2.66GHz (1P/4C)
  Memory: 18GB(PC3-10600R-9)
  Disk:   146GB(15k)*4 RAID1+0
  RAID controller: P410i/256MB
  OS: RHEL 6.4(x86_64)
  FS: Ext4

* Test setting
  I use "pgbench -c 8 -j 4 -T 2400 -S -P 10 -a"
I also used my accurate patch in this test, so I executed the following commands before each benchmark.
    1. cluster all database
    2. truncate pgbench_history
    3. checkpoint
    4. sync
    5. checkpoint

* postgresql.conf
shared_buffers = 2048MB
maintenance_work_mem = 64MB
wal_level = minimal
checkpoint_segments = 300
checkpoint_timeout = 15min
checkpoint_completion_target = 0.7

* Performance test result
** In memory database size
s=1000        |   1   |   2   |   3   |  avg
---------------------------------------------
readahead=on  | 39836 | 40229 | 40055 | 40040
readahead=off | 31259 | 29656 | 30693 | 30536
ratio         |  78%  |  74%  |  77%  |   76%

** Over memory database size
s=2000        |   1  |   2  |    3    |  avg
---------------------------------------------
readahead=on  | 1288 | 1370 |   1367  | 1341
readahead=off | 1683 | 1688 |   1395  | 1589
ratio         | 131% | 123% |   102%  | 118%

s=3000        |   1  |   2  |    3    |  avg
---------------------------------------------
readahead=on  |  965 |  862 |   993   |  940
readahead=off | 1113 | 1098 |   935   | 1049
ratio         | 115% | 127% |   94%   | 112%


Performance looks good except at scale factor 1000. When readahead is off, disk I/O is kept to the necessary minimum, so it is faster than "readahead=on", which performs useless disk I/O. For example, which is faster over many transactions: 8KB random reads or 12KB random reads from disk? It is self-evident that the former is faster.

At scale factor 1000, the buffers become hot more slowly than with "readahead=on", so performance appears lower. But this is inherent in how the performance was measured, and you can confirm it in the attached benchmark graphs. We can use this parameter when the buffers are relatively hot. If you want to see the graphs for the other trials, I will send them.

I will also add MacOS support and write documentation for this patch this week.
# My MacOS machine is at home.

Regards,
--
Mitsumasa KONDO
NTT Open Source Software Center
*** a/configure
--- b/configure
***************
*** 19937,19943 **** LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
  
  
  
! for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat readlink setproctitle setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l
  do
  as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
  { $as_echo "$as_me:$LINENO: checking for $ac_func" >&5
--- 19937,19943 ----
  
  
  
! for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fadvise pstat readlink setproctitle setsid shm_open sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l
  do
  as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
  { $as_echo "$as_me:$LINENO: checking for $ac_func" >&5
*** a/src/backend/commands/tablecmds.c
--- b/src/backend/commands/tablecmds.c
***************
*** 9119,9125 **** copy_relation_data(SMgrRelation src, SMgrRelation dst,
  		/* If we got a cancel signal during the copy of the data, quit */
  		CHECK_FOR_INTERRUPTS();
  
! 		smgrread(src, forkNum, blkno, buf);
  
  		if (!PageIsVerified(page, blkno))
  			ereport(ERROR,
--- 9119,9125 ----
  		/* If we got a cancel signal during the copy of the data, quit */
  		CHECK_FOR_INTERRUPTS();
  
! 		smgrread(src, forkNum, blkno, buf, (char *) BAS_BULKREAD);
  
  		if (!PageIsVerified(page, blkno))
  			ereport(ERROR,
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
***************
*** 41,46 ****
--- 41,47 ----
  #include "pg_trace.h"
  #include "pgstat.h"
  #include "postmaster/bgwriter.h"
+ #include "storage/buf.h"
  #include "storage/buf_internals.h"
  #include "storage/bufmgr.h"
  #include "storage/ipc.h"
***************
*** 451,457 **** ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
  			if (track_io_timing)
  				INSTR_TIME_SET_CURRENT(io_start);
  
! 			smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
  
  			if (track_io_timing)
  			{
--- 452,458 ----
  			if (track_io_timing)
  				INSTR_TIME_SET_CURRENT(io_start);
  
! 			smgrread(smgr, forkNum, blockNum, (char *) bufBlock, (char *) strategy);
  
  			if (track_io_timing)
  			{
*** a/src/backend/storage/file/fd.c
--- b/src/backend/storage/file/fd.c
***************
*** 73,80 ****
--- 73,82 ----
  #include "catalog/pg_tablespace.h"
  #include "common/relpath.h"
  #include "pgstat.h"
+ #include "storage/buf.h"
  #include "storage/fd.h"
  #include "storage/ipc.h"
+ #include "storage/bufmgr.h"
  #include "utils/guc.h"
  #include "utils/resowner_private.h"
  
***************
*** 123,129 **** int			max_files_per_process = 1000;
   * setting this variable, and so need not be tested separately.
   */
  int			max_safe_fds = 32;	/* default if not changed */
! 
  
  /* Debugging.... */
  
--- 125,131 ----
   * setting this variable, and so need not be tested separately.
   */
  int			max_safe_fds = 32;	/* default if not changed */
! bool			enable_kernel_readahead = true ;
  
  /* Debugging.... */
  
***************
*** 383,388 **** pg_flush_data(int fd, off_t offset, off_t amount)
--- 385,405 ----
  	return 0;
  }
  
+ /*
+  * pg_fadvise --- give the OS advice about its file cache usage
+  *
+  * Not all platforms have posix_fadvise. If the platform does not support
+  * posix_fadvise, we do nothing here.
+  */
+ int
+ pg_fadvise(int fd, off_t offset, off_t amount, int advise)
+ {
+ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) && defined(POSIX_FADV_RANDOM) && defined(POSIX_FADV_SEQUENTIAL)
+ 	return posix_fadvise(fd, offset, amount, advise);
+ #else
+ 	return 0;
+ #endif
+ }
  
  /*
   * fsync_fname -- fsync a file or directory, handling errors properly
***************
*** 1142,1147 **** OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
--- 1159,1195 ----
  }
  
  /*
+  * Controlling OS file cache using posix_fadvise()
+  */
+ int
+ FileCacheAdvise(File file, off_t offset, off_t amount, int advise)
+ {
+ 	return pg_fadvise(VfdCache[file].fd, offset, amount, advise);
+ }
+ 
+ /*
+  * Select the OS readahead strategy using the buffer hint. If we select
+  * POSIX_FADV_SEQUENTIAL, kernel readahead is maximized and reads become faster.
+  * On the other hand, if we select POSIX_FADV_RANDOM, readahead is not performed
+  * at all and the file cache replacement algorithm becomes smarter, because it
+  * can correctly count accesses to the data that is actually hot.
+  */
+ int
+ BufferHintIOAdvise(File file, char *offset, off_t amount, char *strategy)
+ {
+ 	if(enable_kernel_readahead)
+ 		return FileCacheAdvise(file, (off_t) offset, amount, POSIX_FADV_NORMAL);
+ 
+ 	/* readahead optimization */
+ 	if(strategy != NULL)
+ 		/* use maximum readahead setting in kernel, we can read more faster */
+ 		return FileCacheAdvise(file, (off_t) offset, amount, POSIX_FADV_SEQUENTIAL);
+ 	else
+ 		/* don't use readahead in kernel, so we can more effectively use OS file cache */
+ 		return FileCacheAdvise(file, (off_t) offset, amount, POSIX_FADV_RANDOM);
+ }
+ 
+ /*
   * close a file when done with it
   */
  void
*** a/src/backend/storage/smgr/md.c
--- b/src/backend/storage/smgr/md.c
***************
*** 162,168 **** static List *pendingUnlinks = NIL;
  static CycleCtr mdsync_cycle_ctr = 0;
  static CycleCtr mdckpt_cycle_ctr = 0;
  
- 
  typedef enum					/* behavior for mdopen & _mdfd_getseg */
  {
  	EXTENSION_FAIL,				/* ereport if segment not present */
--- 162,167 ----
***************
*** 653,659 **** mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
   */
  void
  mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! 	   char *buffer)
  {
  	off_t		seekpos;
  	int			nbytes;
--- 652,658 ----
   */
  void
  mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! 	   char *buffer, char *strategy)
  {
  	off_t		seekpos;
  	int			nbytes;
***************
*** 677,682 **** mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
--- 676,683 ----
  				 errmsg("could not seek to block %u in file \"%s\": %m",
  						blocknum, FilePathName(v->mdfd_vfd))));
  
+ 	/* Control buffered IO in OS by using posix_fadvise() */
+ 	BufferHintIOAdvise(v->mdfd_vfd, buffer, BLCKSZ, strategy);
  	nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);
  
  	TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
*** a/src/backend/storage/smgr/smgr.c
--- b/src/backend/storage/smgr/smgr.c
***************
*** 50,56 **** typedef struct f_smgr
  	void		(*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
  											  BlockNumber blocknum);
  	void		(*smgr_read) (SMgrRelation reln, ForkNumber forknum,
! 										  BlockNumber blocknum, char *buffer);
  	void		(*smgr_write) (SMgrRelation reln, ForkNumber forknum,
  						 BlockNumber blocknum, char *buffer, bool skipFsync);
  	BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
--- 50,56 ----
  	void		(*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
  											  BlockNumber blocknum);
  	void		(*smgr_read) (SMgrRelation reln, ForkNumber forknum,
! 					  BlockNumber blocknum, char *buffer, char *strategy);
  	void		(*smgr_write) (SMgrRelation reln, ForkNumber forknum,
  						 BlockNumber blocknum, char *buffer, bool skipFsync);
  	BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
***************
*** 588,596 **** smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
   */
  void
  smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! 		 char *buffer)
  {
! 	(*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer);
  }
  
  /*
--- 588,596 ----
   */
  void
  smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! 		 char *buffer, char *strategy)
  {
! 	(*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer, strategy);
  }
  
  /*
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 762,767 **** static struct config_bool ConfigureNamesBool[] =
--- 762,776 ----
  		NULL, NULL, NULL
  	},
  	{
+ 		{"enable_kernel_readahead", PGC_USERSET, QUERY_TUNING_METHOD,
+ 			gettext_noop("On is optimize readahead by kernel, off is optimized by postgres."),
+ 			NULL
+ 		},
+ 		&enable_kernel_readahead,
+ 		true,
+ 		NULL, NULL, NULL
+ 	},
+ 	{
  		{"geqo", PGC_USERSET, QUERY_TUNING_GEQO,
  			gettext_noop("Enables genetic query optimization."),
  			gettext_noop("This algorithm attempts to do planning without "
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 135,140 ****
--- 135,142 ----
  
  #temp_file_limit = -1			# limits per-session temp file space
  					# in kB, or -1 for no limit
+ #enable_kernel_readahead = on		# on is optimized by OS,
+ 					# off is optimized by postgres
  
  # - Kernel Resource Usage -
  
*** a/src/include/storage/bufmgr.h
--- b/src/include/storage/bufmgr.h
***************
*** 44,55 **** typedef enum
--- 44,58 ----
  /* in globals.c ... this duplicates miscadmin.h */
  extern PGDLLIMPORT int NBuffers;
  
+ 
+ 
  /* in bufmgr.c */
  extern bool zero_damaged_pages;
  extern int	bgwriter_lru_maxpages;
  extern double bgwriter_lru_multiplier;
  extern bool track_io_timing;
  extern int	target_prefetch_pages;
+ extern bool	enable_kernel_readahead;
  
  /* in buf_init.c */
  extern PGDLLIMPORT char *BufferBlocks;
*** a/src/include/storage/fd.h
--- b/src/include/storage/fd.h
***************
*** 68,73 **** extern int	max_safe_fds;
--- 68,74 ----
  extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode);
  extern File OpenTemporaryFile(bool interXact);
  extern void FileClose(File file);
+ extern int	FileCacheAdvise(File file, off_t offset, off_t amount, int advise);
  extern int	FilePrefetch(File file, off_t offset, int amount);
  extern int	FileRead(File file, char *buffer, int amount);
  extern int	FileWrite(File file, char *buffer, int amount);
***************
*** 75,80 **** extern int	FileSync(File file);
--- 76,82 ----
  extern off_t FileSeek(File file, off_t offset, int whence);
  extern int	FileTruncate(File file, off_t offset);
  extern char *FilePathName(File file);
+ extern int	BufferHintIOAdvise(File file, char *offset, off_t amount, char *strategy);
  
  /* Operations that allow use of regular stdio --- USE WITH CAUTION */
  extern FILE *AllocateFile(const char *name, const char *mode);
***************
*** 113,118 **** extern int	pg_fsync_no_writethrough(int fd);
--- 115,121 ----
  extern int	pg_fsync_writethrough(int fd);
  extern int	pg_fdatasync(int fd);
  extern int	pg_flush_data(int fd, off_t offset, off_t amount);
+ extern int	pg_fadvise(int fd, off_t offset, off_t amount, int advise);
  extern void fsync_fname(char *fname, bool isdir);
  
  /* Filename components for OpenTemporaryFile */
*** a/src/include/storage/smgr.h
--- b/src/include/storage/smgr.h
***************
*** 92,98 **** extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
  extern void smgrprefetch(SMgrRelation reln, ForkNumber forknum,
  			 BlockNumber blocknum);
  extern void smgrread(SMgrRelation reln, ForkNumber forknum,
! 		 BlockNumber blocknum, char *buffer);
  extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
  		  BlockNumber blocknum, char *buffer, bool skipFsync);
  extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
--- 92,98 ----
  extern void smgrprefetch(SMgrRelation reln, ForkNumber forknum,
  			 BlockNumber blocknum);
  extern void smgrread(SMgrRelation reln, ForkNumber forknum,
! 			BlockNumber blocknum, char *buffer, char *strategy);
  extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
  		  BlockNumber blocknum, char *buffer, bool skipFsync);
  extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
***************
*** 118,124 **** extern void mdextend(SMgrRelation reln, ForkNumber forknum,
  extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
  		   BlockNumber blocknum);
  extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! 	   char *buffer);
  extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
  		BlockNumber blocknum, char *buffer, bool skipFsync);
  extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
--- 118,124 ----
  extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
  		   BlockNumber blocknum);
  extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
! 	   char *buffer, char *strategy);
  extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
  		BlockNumber blocknum, char *buffer, bool skipFsync);
  extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
*** a/src/test/regress/expected/rangefuncs.out
--- b/src/test/regress/expected/rangefuncs.out
***************
*** 1,18 ****
  SELECT name, setting FROM pg_settings WHERE name LIKE 'enable%';
!          name         | setting 
! ----------------------+---------
!  enable_bitmapscan    | on
!  enable_hashagg       | on
!  enable_hashjoin      | on
!  enable_indexonlyscan | on
!  enable_indexscan     | on
!  enable_material      | on
!  enable_mergejoin     | on
!  enable_nestloop      | on
!  enable_seqscan       | on
!  enable_sort          | on
!  enable_tidscan       | on
! (11 rows)
  
  CREATE TABLE foo2(fooid int, f2 int);
  INSERT INTO foo2 VALUES(1, 11);
--- 1,19 ----
  SELECT name, setting FROM pg_settings WHERE name LIKE 'enable%';
!           name           | setting 
! -------------------------+---------
!  enable_bitmapscan       | on
!  enable_hashagg          | on
!  enable_hashjoin         | on
!  enable_indexonlyscan    | on
!  enable_indexscan        | on
!  enable_kernel_readahead | on
!  enable_material         | on
!  enable_mergejoin        | on
!  enable_nestloop         | on
!  enable_seqscan          | on
!  enable_sort             | on
!  enable_tidscan          | on
! (12 rows)
  
  CREATE TABLE foo2(fooid int, f2 int);
  INSERT INTO foo2 VALUES(1, 11);

<<attachment: s=1000-try1.png>>

<<attachment: s=2000-try1.png>>

<<attachment: s=3000-try1.png>>

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to