On Fri, Oct 28, 2022 at 7:39 AM Michael Paquier <mich...@paquier.xyz> wrote: > > On Thu, Oct 27, 2022 at 03:58:25PM -0700, Andres Freund wrote: > > The block sizes don't need to match, do they? As long as the block is properly > > aligned, we can change the iov_len of the final iov to match whatever the size > > is being passed in, no? > > Hmm. Based on what Bharath has written upthread, it does not seem to > matter if the size of the aligned block changes, either: > https://www.postgresql.org/message-id/calj2acuccjr7kbkqwosqmqh1zgedyj7hh5ef+dohcv7+kon...@mail.gmail.com > > I am honestly not sure whether it is a good idea to make file_utils.c > depend on one of the compile-time page sizes in this routine, be it > the page size of the WAL page size, as pg_write_zeros() would be used > for some rather low-level operations. But we could as well just use a > locally-defined structure with a buffer at 4kB or 8kB and call it a > day?
+1. Please see the attached v8 patch. -- Bharath Rupireddy PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
From 88d67c6e3e92a62ac4c783a137759b2f3795cb21 Mon Sep 17 00:00:00 2001 From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com> Date: Fri, 28 Oct 2022 04:44:41 +0000 Subject: [PATCH v8] Use pg_pwritev_with_retry() instead of write() in walmethods.c Use pg_pwritev_with_retry() while prepadding a WAL segment instead of write() in walmethods.c dir_open_for_write() to avoid partial writes. As the pg_pwritev_with_retry() function uses pwritev, we can avoid explicit lseek() on non-Windows platforms to seek to the beginning of the WAL segment. It looks like on Windows, we need an explicit lseek() call here despite using pwrite() implementation from win32pwrite.c. Otherwise an error occurs. These changes are inline with how core postgres initializes the WAL segment in XLogFileInitInternal(). Author: Bharath Rupireddy Reviewed-by: Nathan Bossart, Michael Paquier Reviewed-by: Thomas Munro, Andres Freund Discussion: https://www.postgresql.org/message-id/CALj2ACUq7nAb7%3DbJNbK3yYmp-SZhJcXFR_pLk8un6XgDzDF3OA%40mail.gmail.com --- src/backend/access/transam/xlog.c | 33 ++--------- src/bin/pg_basebackup/walmethods.c | 26 +++++---- src/common/file_utils.c | 92 ++++++++++++++++++++++++++++++ src/include/common/file_utils.h | 3 + 4 files changed, 114 insertions(+), 40 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8f10effe3a..15dd5ce834 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2921,7 +2921,6 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, bool *added, char *path) { char tmppath[MAXPGPATH]; - PGAlignedXLogBlock zbuffer; XLogSegNo installed_segno; XLogSegNo max_segno; int fd; @@ -2965,14 +2964,11 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tmppath))); - memset(zbuffer.data, 0, XLOG_BLCKSZ); - pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); save_errno = 0; if (wal_init_zero) { - struct iovec iov[PG_IOV_MAX]; - int blocks; + ssize_t rc; /* * Zero-fill the file. With this setting, we do this the hard way to @@ -2983,29 +2979,10 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, * indirect blocks are down on disk. Therefore, fdatasync(2) or * O_DSYNC will be sufficient to sync future writes to the log file. */ + rc = pg_pwrite_zeros(fd, wal_segment_size); - /* Prepare to write out a lot of copies of our zero buffer at once. */ - for (int i = 0; i < lengthof(iov); ++i) - { - iov[i].iov_base = zbuffer.data; - iov[i].iov_len = XLOG_BLCKSZ; - } - - /* Loop, writing as many blocks as we can for each system call. */ - blocks = wal_segment_size / XLOG_BLCKSZ; - for (int i = 0; i < blocks;) - { - int iovcnt = Min(blocks - i, lengthof(iov)); - off_t offset = i * XLOG_BLCKSZ; - - if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0) - { - save_errno = errno; - break; - } - - i += iovcnt; - } + if (rc < 0) + save_errno = errno; } else { @@ -3014,7 +2991,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, * enough. */ errno = 0; - if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1) + if (pg_pwrite(fd, "\0", 1, wal_segment_size - 1) != 1) { /* if write didn't set errno, assume no disk space */ save_errno = errno ? errno : ENOSPC; diff --git a/src/bin/pg_basebackup/walmethods.c b/src/bin/pg_basebackup/walmethods.c index bc2e83d02b..18bad52c96 100644 --- a/src/bin/pg_basebackup/walmethods.c +++ b/src/bin/pg_basebackup/walmethods.c @@ -220,22 +220,24 @@ dir_open_for_write(WalWriteMethod *wwmethod, const char *pathname, /* Do pre-padding on non-compressed files */ if (pad_to_size && wwmethod->compression_algorithm == PG_COMPRESSION_NONE) { - PGAlignedXLogBlock zerobuf; - int bytes; + ssize_t rc; - memset(zerobuf.data, 0, XLOG_BLCKSZ); - for (bytes = 0; bytes < pad_to_size; bytes += XLOG_BLCKSZ) + rc = pg_pwrite_zeros(fd, pad_to_size); + + if (rc < 0) { - errno = 0; - if (write(fd, zerobuf.data, XLOG_BLCKSZ) != XLOG_BLCKSZ) - { - /* If write didn't set errno, assume problem is no disk space */ - wwmethod->lasterrno = errno ? errno : ENOSPC; - close(fd); - return NULL; - } + wwmethod->lasterrno = errno; + close(fd); + return NULL; } + /* + * pg_pwrite() (called via pg_pwrite_zeros()) might move the file + * position on some platforms (see win32pwrite.c). We want to + * explicitly reset the file position in a platform-independent manner + * for extensibility even though it costs an extra system call on the + * platforms where the file position is guaranteed to not change. + */ if (lseek(fd, 0, SEEK_SET) != 0) { wwmethod->lasterrno = errno; diff --git a/src/common/file_utils.c b/src/common/file_utils.c index eac05a13ed..a115a3fd94 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -30,6 +30,27 @@ #endif #include "port/pg_iovec.h" +/* + * We use MAXALIGN'ed write buffer to not cause problems on alignment-picky + * hardware. See comments around PGAlignedBlock and PGAlignedXLogBlock for more + * details. + */ + +#define PG_WRITE_BLCKSZ 8192 + +typedef union PGAlignedWriteBlock +{ + /* + * XXX: Writing more than one block of size PG_WRITE_BLCKSZ bytes per might + * improve write performance on some platforms. However, no such evidence + * found so far with varying block sizes. Hence we stick to one + * PG_WRITE_BLCKSZ block for now. + */ + char data[PG_WRITE_BLCKSZ]; + double force_align_d; + int64 force_align_i64; +} PGAlignedWriteBlock; + #ifdef FRONTEND /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ @@ -527,3 +548,74 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) return sum; } + +/* + * pg_pwrite_zeros + * + * Writes zeros to a given file. Input parameters are "fd" (file descriptor of + * the file), "size" (size of the file in bytes). + * + * On failure, a negative value is returned and errno is set appropriately so + * that the caller can use it accordingly. + */ +ssize_t +pg_pwrite_zeros(int fd, size_t size) +{ + PGAlignedWriteBlock zbuffer; + struct iovec iov[PG_IOV_MAX]; + int blocks; + size_t remaining_size = 0; + int i; + ssize_t written; + ssize_t total_written = 0; + + /* Zero-fill the passed in buffer. */ + memset(zbuffer.data, 0, PG_WRITE_BLCKSZ); + + /* Prepare to write out a lot of copies of our zero buffer at once. */ + for (i = 0; i < lengthof(iov); ++i) + { + iov[i].iov_base = zbuffer.data; + iov[i].iov_len = PG_WRITE_BLCKSZ; + } + + /* Loop, writing as many blocks as we can for each system call. */ + blocks = size / PG_WRITE_BLCKSZ; + remaining_size = size % PG_WRITE_BLCKSZ; + for (i = 0; i < blocks;) + { + int iovcnt = Min(blocks - i, lengthof(iov)); + off_t offset = i * PG_WRITE_BLCKSZ; + + written = pg_pwritev_with_retry(fd, iov, iovcnt, offset); + + if (written < 0) + return written; + + i += iovcnt; + total_written += written; + } + + /* Now, write the remaining size, if any, of the file with zeros. */ + if (remaining_size > 0) + { + /* We'll never write more than one block here */ + int iovcnt = 1; + + /* Jump on to the end of previously written blocks */ + off_t offset = i * PG_WRITE_BLCKSZ; + + iov[0].iov_len = remaining_size; + + written = pg_pwritev_with_retry(fd, iov, iovcnt, offset); + + if (written < 0) + return written; + + total_written += written; + } + + Assert(total_written == size); + + return total_written; +} diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 2c5dbcb0b1..52d5833c03 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -44,4 +44,7 @@ extern ssize_t pg_pwritev_with_retry(int fd, int iovcnt, off_t offset); +extern ssize_t pg_pwrite_zeros(int fd, + size_t size); + #endif /* FILE_UTILS_H */ -- 2.34.1