On Mon, Jan 18, 2010 at 4:35 PM, Greg Stark <[email protected]> wrote:
> Looking at this patch for the commitfest I have a few questions.
So I've touched this patch up a bit:
1) moved the posix_fadvise call to a new fd.c function,
pg_fsync_start(fd, offset, nbytes), which initiates an fsync without
waiting on it. Currently it's only implemented with
posix_fadvise(POSIX_FADV_DONTNEED), but I want to look into using
sync_file_range in the future -- it looks like that call might be good
enough for our checkpoints (rough sketch below).

2) advised each 64kB chunk as we write it, which should avoid
poisoning the cache if you do a large CREATE DATABASE on an active
system.

3) added the promised but, as far as I can tell, missing fsync of the
directory -- I think we should actually back-patch this. (There's a
small standalone illustration of the directory-entry issue after the
patch.)

Barring any objections, shall I commit it like this?
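
For the archives, here's a rough sketch of what the sync_file_range()
variant of pg_fsync_start() might look like. This is untested and the
HAVE_SYNC_FILE_RANGE configure symbol is hypothetical -- we don't
probe for it today; sync_file_range() itself is Linux-only and needs
<fcntl.h> with _GNU_SOURCE:

int
pg_fsync_start(int fd, off_t offset, off_t amount)
{
	if (!enableFsync)
		return 0;

#if defined(HAVE_SYNC_FILE_RANGE)
	/* Start writeback of the range asynchronously; don't wait for it. */
	return sync_file_range(fd, offset, amount, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	/* Fallback: hint that we're done with these pages, nudging writeback. */
	return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
#else
	return 0;
#endif
}

SYNC_FILE_RANGE_WRITE starts write-out of the dirty pages in the given
range without waiting for completion, which seems closer to what we
actually want here than the DONTNEED hint.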
--
greg
*** a/src/backend/storage/file/fd.c
--- b/src/backend/storage/file/fd.c
***************
*** 319,324 **** pg_fdatasync(int fd)
--- 319,340 ----
return 0;
}
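+
+ /*
+  * pg_fsync_start --- initiate an fsync of the given file range without
+  * waiting for it to complete.  This is only a hint to the kernel; the
+  * real fsync must still happen later.
+  */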
+ int
+ pg_fsync_start(int fd, off_t offset, off_t amount)
+ {
+ 	if (enableFsync)
+ 	{
+ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+ 		return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
+ #else
+ 		return 0;
+ #endif
+ 	}
+ 	else
+ 		return 0;
+ }
+
+
/*
* InitFileAccess --- initialize this module during backend startup
*
*** a/src/include/storage/fd.h
--- b/src/include/storage/fd.h
***************
*** 98,103 **** extern int pg_fsync(int fd);
--- 98,104 ----
extern int pg_fsync_no_writethrough(int fd);
extern int pg_fsync_writethrough(int fd);
extern int pg_fdatasync(int fd);
+ extern int pg_fsync_start(int fd, off_t offset, off_t amount);
/* Filename components for OpenTemporaryFile */
#define PG_TEMP_FILES_DIR "pgsql_tmp"
*** a/src/port/copydir.c
--- b/src/port/copydir.c
***************
*** 37,42 ****
--- 37,43 ----
static void copy_file(char *fromfile, char *tofile);
+ static void fsync_fname(char *fname);
/*
***************
*** 64,69 **** copydir(char *fromdir, char *todir, bool recurse)
--- 65,73 ----
(errcode_for_file_access(),
errmsg("could not open directory \"%s\": %m", fromdir)));
+ 	/*
+ 	 * Copy all the files
+ 	 */
while ((xlde = ReadDir(xldir, fromdir)) != NULL)
{
struct stat fst;
***************
*** 89,96 **** copydir(char *fromdir, char *todir, bool recurse)
--- 93,127 ----
else if (S_ISREG(fst.st_mode))
copy_file(fromfile, tofile);
}
+ 	FreeDir(xldir);
+
+ 	/*
+ 	 * Be paranoid here and fsync all files to ensure we catch problems.
+ 	 */
+ 	xldir = AllocateDir(fromdir);
+ 	if (xldir == NULL)
+ 		ereport(ERROR,
+ 				(errcode_for_file_access(),
+ 				 errmsg("could not open directory \"%s\": %m", fromdir)));
+
+ 	while ((xlde = ReadDir(xldir, fromdir)) != NULL)
+ 	{
+ 		if (strcmp(xlde->d_name, ".") == 0 ||
+ 			strcmp(xlde->d_name, "..") == 0)
+ 			continue;
+ 		snprintf(tofile, MAXPGPATH, "%s/%s", todir, xlde->d_name);
+ 		fsync_fname(tofile);
+ 	}
FreeDir(xldir);
+
+ 	/*
+ 	 * It's important to fsync the destination directory itself, as
+ 	 * individual file fsyncs don't guarantee that the directory entry
+ 	 * for the file is synced.  Recent versions of ext4 have made the
+ 	 * window much wider, but it has been an issue for ext3 and other
+ 	 * filesystems in the past.
+ 	 */
+ 	fsync_fname(todir);
}
/*
***************
*** 103,108 **** copy_file(char *fromfile, char *tofile)
--- 134,140 ----
int srcfd;
int dstfd;
int nbytes;
+ 	off_t		offset;
/* Use palloc to ensure we get a maxaligned buffer */
#define COPY_BUF_SIZE (8 * BLCKSZ)
***************
*** 128,134 **** copy_file(char *fromfile, char *tofile)
/*
* Do the data copying.
*/
! for (;;)
{
nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
if (nbytes < 0)
--- 160,166 ----
/*
* Do the data copying.
*/
! 	for (offset = 0;; offset += nbytes)
{
nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
if (nbytes < 0)
***************
*** 147,161 **** copy_file(char *fromfile, char *tofile)
(errcode_for_file_access(),
errmsg("could not write to file \"%s\": %m", tofile)));
}
- }
! /*
! * Be paranoid here to ensure we catch problems.
! */
! if (pg_fsync(dstfd) != 0)
! ereport(ERROR,
! (errcode_for_file_access(),
! errmsg("could not fsync file \"%s\": %m", tofile)));
if (close(dstfd))
ereport(ERROR,
--- 179,195 ----
(errcode_for_file_access(),
errmsg("could not write to file \"%s\": %m", tofile)));
}
! 		/*
! 		 * Avoid poisoning the cache with the entire database, just
! 		 * like we do for large sequential scans.  Actually that's a
! 		 * white lie; the real motivation is that it might urge the
! 		 * kernel to flush these buffers when convenient, before we
! 		 * get to the later fsyncs.  Empirical results seem to show a
! 		 * big speed benefit on recent Linux kernels.
! 		 */
! 		pg_fsync_start(dstfd, offset, nbytes);
! 	}
if (close(dstfd))
ereport(ERROR,
***************
*** 166,168 **** copy_file(char *fromfile, char *tofile)
--- 200,224 ----
pfree(buffer);
}
+
+ /*
+  * fsync a file
+  */
+ static void
+ fsync_fname(char *fname)
+ {
+ 	int			fd = BasicOpenFile(fname,
+ 								   O_RDONLY | PG_BINARY,
+ 								   S_IRUSR | S_IWUSR);
+
+ 	if (fd < 0)
+ 		ereport(ERROR,
+ 				(errcode_for_file_access(),
+ 				 errmsg("could not open file \"%s\": %m", fname)));
+
+ 	if (pg_fsync(fd) != 0)
+ 		ereport(ERROR,
+ 				(errcode_for_file_access(),
+ 				 errmsg("could not fsync file \"%s\": %m", fname)));
+ 	close(fd);
+ }
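
To spell out the point behind (3) outside the patch: fsyncing a newly
created file doesn't guarantee that its directory entry is durable, so
you also have to fsync the parent directory. A minimal standalone
illustration with plain POSIX calls (illustrative names only, not part
of the patch):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Create a file durably: fsync the file's data, then fsync the parent
 * directory so the directory entry itself survives a crash.
 */
static void
write_durably(const char *dirpath, const char *filepath,
			  const char *buf, size_t len)
{
	int		fd;
	int		dirfd;

	fd = open(filepath, O_WRONLY | O_CREAT, 0600);
	if (fd < 0 || write(fd, buf, len) != (ssize_t) len || fsync(fd) != 0)
	{
		perror(filepath);
		exit(1);
	}
	close(fd);

	/* Now make the directory entry durable too. */
	dirfd = open(dirpath, O_RDONLY);
	if (dirfd < 0 || fsync(dirfd) != 0)
	{
		perror(dirpath);
		exit(1);
	}
	close(dirfd);
}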
--
Sent via pgsql-performance mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-performance