On Mon, Jan 18, 2010 at 4:35 PM, Greg Stark <gsst...@mit.edu> wrote:
> Looking at this patch for the commitfest I have a few questions.

So I've touched this patch up a bit:

1) moved the posix_fadvise call to a new fd.c function
pg_fsync_start(fd, offset, nbytes), which initiates an fsync without
waiting on it. Currently it's only implemented with
posix_fadvise(POSIX_FADV_DONTNEED), but I want to look into using
sync_file_range in the future -- it looks like that call might be good
enough for our checkpoints (see the sketch below this list).

2) advised each 64k chunk as we write it, which should avoid poisoning
the cache if you do a large CREATE DATABASE on an active system.

3) added the promised but AFAICT missing fsync of the directory -- I
think we should actually backpatch this.
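
For 1), here's a rough sketch of what a sync_file_range variant of
pg_fsync_start might look like, as a drop-in replacement for the
function in the patch below. This is only a sketch, not part of the
patch: HAVE_SYNC_FILE_RANGE is a hypothetical configure symbol, and I'm
assuming SYNC_FILE_RANGE_WRITE is the right flag for "start writeback
but don't wait":

int
pg_fsync_start(int fd, off_t offset, off_t amount)
{
	if (!enableFsync)
		return 0;

#if defined(HAVE_SYNC_FILE_RANGE)
	/* ask the kernel to start writeback on this range without waiting */
	return sync_file_range(fd, offset, amount, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	/* fall back to what the patch below actually does */
	return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
#else
	return 0;
#endif
}

I believe SYNC_FILE_RANGE_WRITE starts writeback without evicting the
pages from cache, which might be friendlier to the later fsyncs than
DONTNEED, but I haven't measured that yet.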

Barring any objections shall I commit it like this?


-- 
greg
*** a/src/backend/storage/file/fd.c
--- b/src/backend/storage/file/fd.c
***************
*** 319,324 **** pg_fdatasync(int fd)
--- 319,340 ----
  		return 0;
  }
  
+ int
+ pg_fsync_start(int fd, off_t offset, off_t amount)
+ {
+ 	if (enableFsync)
+ 	{
+ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+ 		return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
+ #else
+ 		return 0;
+ #endif
+ 	}
+ 	else
+ 		return 0;
+ }
+ 
+ 
  /*
   * InitFileAccess --- initialize this module during backend startup
   *
*** a/src/include/storage/fd.h
--- b/src/include/storage/fd.h
***************
*** 98,103 **** extern int	pg_fsync(int fd);
--- 98,104 ----
  extern int	pg_fsync_no_writethrough(int fd);
  extern int	pg_fsync_writethrough(int fd);
  extern int	pg_fdatasync(int fd);
+ extern int	pg_fsync_start(int fd, off_t offset, off_t amount);
  
  /* Filename components for OpenTemporaryFile */
  #define PG_TEMP_FILES_DIR "pgsql_tmp"
*** a/src/port/copydir.c
--- b/src/port/copydir.c
***************
*** 37,42 ****
--- 37,43 ----
  
  
  static void copy_file(char *fromfile, char *tofile);
+ static void fsync_fname(char *fname);
  
  
  /*
***************
*** 64,69 **** copydir(char *fromdir, char *todir, bool recurse)
--- 65,73 ----
  				(errcode_for_file_access(),
  				 errmsg("could not open directory \"%s\": %m", fromdir)));
  
+ 	/*
+ 	 * Copy all the files
+ 	 */
  	while ((xlde = ReadDir(xldir, fromdir)) != NULL)
  	{
  		struct stat fst;
***************
*** 89,96 **** copydir(char *fromdir, char *todir, bool recurse)
--- 93,127 ----
  		else if (S_ISREG(fst.st_mode))
  			copy_file(fromfile, tofile);
  	}
+ 	FreeDir(xldir);
+ 
+ 	/*
+ 	 * Be paranoid here and fsync all files to ensure we catch problems.
+ 	 */
+ 	xldir = AllocateDir(fromdir);
+ 	if (xldir == NULL)
+ 		ereport(ERROR,
+ 				(errcode_for_file_access(),
+ 				 errmsg("could not open directory \"%s\": %m", fromdir)));
+ 
+ 	while ((xlde = ReadDir(xldir, fromdir)) != NULL)
+ 	{
+ 		if (strcmp(xlde->d_name, ".") == 0 ||
+ 			strcmp(xlde->d_name, "..") == 0)
+ 			continue;
  
+ 		snprintf(tofile, MAXPGPATH, "%s/%s", todir, xlde->d_name);
+ 		fsync_fname(tofile);
+ 	}
  	FreeDir(xldir);
+ 
+ 	/* It's important to fsync the destination directory itself as
+ 	 * individual file fsyncs don't guarantee that the directory entry
+ 	 * for the file is synced. Recent versions of ext4 have made the
+ 	 * window much wider but it's been true for ext3 and other
+ 	 * filesystems in the past.
+ 	 */
+ 	fsync_fname(todir);
  }
  
  /*
***************
*** 103,108 **** copy_file(char *fromfile, char *tofile)
--- 134,140 ----
  	int			srcfd;
  	int			dstfd;
  	int			nbytes;
+ 	off_t		offset;
  
  	/* Use palloc to ensure we get a maxaligned buffer */
  #define COPY_BUF_SIZE (8 * BLCKSZ)
***************
*** 128,134 **** copy_file(char *fromfile, char *tofile)
  	/*
  	 * Do the data copying.
  	 */
! 	for (;;)
  	{
  		nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
  		if (nbytes < 0)
--- 160,166 ----
  	/*
  	 * Do the data copying.
  	 */
! 	for (offset = 0; ; offset += nbytes)
  	{
  		nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
  		if (nbytes < 0)
***************
*** 147,161 **** copy_file(char *fromfile, char *tofile)
  					(errcode_for_file_access(),
  					 errmsg("could not write to file \"%s\": %m", tofile)));
  		}
- 	}
  
! 	/*
! 	 * Be paranoid here to ensure we catch problems.
! 	 */
! 	if (pg_fsync(dstfd) != 0)
! 		ereport(ERROR,
! 				(errcode_for_file_access(),
! 				 errmsg("could not fsync file \"%s\": %m", tofile)));
  
  	if (close(dstfd))
  		ereport(ERROR,
--- 179,195 ----
  					(errcode_for_file_access(),
  					 errmsg("could not write to file \"%s\": %m", tofile)));
  		}
  
! 		/*
! 		 * Avoid poisoning the cache with the entire database just
! 		 * like we do for large sequential scans. Actually that's a
! 		 * white lie; the real motivation for this is that it might
! 		 * urge the kernel to flush these buffers when convenient
! 		 * before we get to the later fsyncs. Empirical results seem
! 		 * to show a big speed benefit on recent Linux kernels.
! 		 */
! 		pg_fsync_start(dstfd, offset, nbytes);
! 	}
  
  	if (close(dstfd))
  		ereport(ERROR,
***************
*** 166,168 **** copy_file(char *fromfile, char *tofile)
--- 200,224 ----
  
  	pfree(buffer);
  }
+ 
+ /*
+  * fsync a file
+  */
+ static void
+ fsync_fname(char *fname)
+ {
+ 	int	fd = BasicOpenFile(fname, 
+ 						   O_RDONLY | PG_BINARY,
+ 						   S_IRUSR | S_IWUSR);
+ 
+ 	if (fd < 0)
+ 		ereport(ERROR,
+ 				(errcode_for_file_access(),
+ 				 errmsg("could not open file \"%s\": %m", fname)));
+ 
+ 	if (pg_fsync(fd) != 0)
+ 		ereport(ERROR,
+ 				(errcode_for_file_access(),
+ 				 errmsg("could not fsync file \"%s\": %m", fname)));
+ 	close(fd);
+ }